diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/MoRA/README.md b/MoRA/README.md new file mode 100644 index 0000000000000000000000000000000000000000..304b0570eee175015a38d05a10f251068b034154 --- /dev/null +++ b/MoRA/README.md @@ -0,0 +1,68 @@ +# [MoRA: High-Rank Updating for Parameter-Efficient Fine-Tuning](https://arxiv.org/abs/2405.12130) + +## Setup + +We implement MoRA in peft-mora, based on HF peft, in the [`apply_mora`](https://github.com/kongds/MoRA/blob/main/peft-mora/src/peft/tuners/lora/layer.py#L229) and [`get_delta_weight`](https://github.com/kongds/MoRA/blob/main/peft-mora/src/peft/tuners/lora/layer.py#L514) methods. +``` sh +pip install -e ./peft-mora +``` + +After installation, it can be used as follows: + +``` python +from peft import LoraConfig, get_peft_model +config = LoraConfig( + # enable MoRA + use_mora=True, + # type 1 (Sharing) for large lora ranks, Eq. 6 in paper + # type 6 (RoPE based) for small lora ranks, Eq. 9 in paper + mora_type=6, + # lora rank here, we will calculate the corresponding $\hat{r}$ in MoRA + r=lora_r, + # MoRA does not use lora_alpha + # lora_alpha=lora_alpha, + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + task_type="CAUSAL_LM", + **kwargs, +) +model = get_peft_model(model, config) + +# training here... + +# can be merged into the model via `merge_and_unload`, like LoRA +model = model.merge_and_unload() +``` + +## Examples +### Fine-tuning MetaMath with MoRA + +``` sh +RANK=8 +deepspeed --num_gpus=8 --num_nodes=2 train.py \ + --base_model --micro_batch_size 4\ + --wandb_run_name mora_math_r8 --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,down_proj,up_proj \ + --num_epochs 3 --deepspeed ds.config --wandb_project lora-math --lora_r $RANK --batch_size 128 \ + --data_path meta-math/MetaMath \ + --save_steps 3000 \ + --learning_rate 3e-4 --mora_type 6 \ + --logging_steps 5 --use_bf16 --use_16bit --use_mora +``` + +### Pretraining + +``` sh +deepspeed --num_gpus=8 --num_nodes=4 train.py \ + --micro_batch_size 16 --wandb_run_name mora-pretrain250m-r128 \ + --num_epochs 1 --wandb_project lora-pretrain --batch_size 1024 \ + --data_path --logging_steps 1 \ + --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,down_proj,up_proj \ + --lora_r 128 --lora_alpha 64 --warmup_steps 1000 \ + --force_tqdm_update --lr_scheduler_type cosine \ + --max_steps 10000 --pretrain 250m \ + --train_embhead --learning_rate 5e-4 \ + --use_mora --use_relora --use_relora_step 2000 # ReMoRA: merge every 2000 steps +``` + +## Acknowledgement +Our code is based on peft, alpaca-lora, and ReLoRA diff --git a/MoRA/config.py b/MoRA/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c08d8cc8d2f3eb2034e42e044cb82c5b163d8c8f --- /dev/null +++ b/MoRA/config.py @@ -0,0 +1,4 @@ +from peft_mora import LoraConfig + +class MoRAModelForCausalLM(LoraConfig): + pass diff --git a/MoRA/model.py b/MoRA/model.py new file mode 100644 index 0000000000000000000000000000000000000000..40c9e946d28be7517eb5fb6f20a0a795f4af0d32 --- /dev/null +++ b/MoRA/model.py @@ -0,0 +1,4 @@ +from peft_mora import PeftModelForCausalLM + +class MoRAModelForCausalLM(PeftModelForCausalLM): +
pass diff --git a/MoRA/peft_mora/__init__.py b/MoRA/peft_mora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a197236800e64ebe412319e0b490e29f0136ba16 --- /dev/null +++ b/MoRA/peft_mora/__init__.py @@ -0,0 +1,90 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.9.0" + +from .auto import ( + AutoPeftModel, + AutoPeftModelForCausalLM, + AutoPeftModelForSequenceClassification, + AutoPeftModelForSeq2SeqLM, + AutoPeftModelForTokenClassification, + AutoPeftModelForQuestionAnswering, + AutoPeftModelForFeatureExtraction, +) +from .mapping import ( + MODEL_TYPE_TO_PEFT_MODEL_MAPPING, + PEFT_TYPE_TO_CONFIG_MAPPING, + get_peft_config, + get_peft_model, + inject_adapter_in_model, +) +from .mixed_model import PeftMixedModel +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, + PeftModelForQuestionAnswering, + PeftModelForFeatureExtraction, +) +from .tuners import ( + AdaptionPromptConfig, + AdaptionPromptModel, + LoraConfig, + LoftQConfig, + LoraModel, + LoHaConfig, + LoHaModel, + LoKrConfig, + LoKrModel, + IA3Config, + IA3Model, + AdaLoraConfig, + AdaLoraModel, + PrefixEncoder, + PrefixTuningConfig, + PromptEmbedding, + PromptEncoder, + PromptEncoderConfig, + PromptEncoderReparameterizationType, + PromptTuningConfig, + PromptTuningInit, + MultitaskPromptTuningConfig, + MultitaskPromptTuningInit, + OFTConfig, + OFTModel, + PolyConfig, + PolyModel, +) +from .utils import ( + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + PeftType, + TaskType, + bloom_model_postprocess_past_key_value, + get_peft_model_state_dict, + prepare_model_for_int8_training, + prepare_model_for_kbit_training, + set_peft_model_state_dict, + shift_tokens_right, + load_peft_weights, + cast_mixed_precision_params, +) +from .config import PeftConfig, PromptLearningConfig diff --git a/MoRA/peft_mora/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6f661d01e55501c00f3a27413836720bd25ae04 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/auto.cpython-312.pyc b/MoRA/peft_mora/__pycache__/auto.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05dd40265a4c1a6a9ef5a8728b99513281f9f7f8 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/auto.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..017717abf1965828f86eda40cb8743bd6856058d Binary files /dev/null and 
b/MoRA/peft_mora/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/import_utils.cpython-312.pyc b/MoRA/peft_mora/__pycache__/import_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d46c04d6e509a9e1879bbbf2f6c4dcf1a7dbd7f9 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/import_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/mapping.cpython-312.pyc b/MoRA/peft_mora/__pycache__/mapping.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..822bb5fd4572d22e2cff40c2cfdd4a89afb83f7b Binary files /dev/null and b/MoRA/peft_mora/__pycache__/mapping.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/mixed_model.cpython-312.pyc b/MoRA/peft_mora/__pycache__/mixed_model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ca9182613197a95d93f92edc8d6215ff49cd592 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/mixed_model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/peft_model.cpython-312.pyc b/MoRA/peft_mora/__pycache__/peft_model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae7ede202c501ebbb6baa3ba323770186865f9c4 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/peft_model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/auto.py b/MoRA/peft_mora/auto.py new file mode 100644 index 0000000000000000000000000000000000000000..353c9e2f84c48bd61194102da6d6e83dfdcd42db --- /dev/null +++ b/MoRA/peft_mora/auto.py @@ -0,0 +1,170 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import importlib +import os +from typing import Optional + +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoTokenizer, +) + +from .config import PeftConfig +from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForFeatureExtraction, + PeftModelForQuestionAnswering, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, +) +from .utils.constants import TOKENIZER_CONFIG_NAME +from .utils.other import check_file_exists_on_hf_hub + + +class _BaseAutoPeftModel: + _target_class = None + _target_peft_class = None + + def __init__(self, *args, **kwargs): + # For consistency with transformers: https://github.com/huggingface/transformers/blob/91d7df58b6537d385e90578dac40204cb550f706/src/transformers/models/auto/auto_factory.py#L400 + raise EnvironmentError( # noqa: UP024 + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_config(config)` methods." 
+ ) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs, + ): + r""" + A wrapper around all the preprocessing steps a user needs to perform in order to load a PEFT model. The kwargs + are passed along to `PeftConfig` that automatically takes care of filtering the kwargs of the Hub methods and + the config object init. + """ + peft_config = PeftConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + base_model_path = peft_config.base_model_name_or_path + + task_type = getattr(peft_config, "task_type", None) + + if cls._target_class is not None: + target_class = cls._target_class + elif cls._target_class is None and task_type is not None: + # this is only in the case where we use `AutoPeftModel` + raise ValueError( + "Cannot use `AutoPeftModel` with a task type, please use a specific class for your task type. (e.g. `AutoPeftModelForCausalLM` for `task_type='CAUSAL_LM'`)" + ) + + if task_type is not None: + expected_target_class = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[task_type] + if cls._target_peft_class.__name__ != expected_target_class.__name__: + raise ValueError( + f"Expected target PEFT class: {expected_target_class.__name__}, but you have asked for: {cls._target_peft_class.__name__ }" + " make sure that you are loading the correct model for your task type." + ) + elif task_type is None and getattr(peft_config, "auto_mapping", None) is not None: + auto_mapping = getattr(peft_config, "auto_mapping", None) + base_model_class = auto_mapping["base_model_class"] + parent_library_name = auto_mapping["parent_library"] + + parent_library = importlib.import_module(parent_library_name) + target_class = getattr(parent_library, base_model_class) + else: + raise ValueError( + "Cannot infer the auto class from the config, please make sure that you are loading the correct model for your task type." 
+ ) + + base_model = target_class.from_pretrained(base_model_path, **kwargs) + + tokenizer_exists = False + if os.path.exists(os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_NAME)): + tokenizer_exists = True + else: + token = kwargs.get("token", None) + if token is None: + token = kwargs.get("use_auth_token", None) + + tokenizer_exists = check_file_exists_on_hf_hub( + repo_id=pretrained_model_name_or_path, + filename=TOKENIZER_CONFIG_NAME, + revision=kwargs.get("revision", None), + repo_type=kwargs.get("repo_type", None), + token=token, + ) + + if tokenizer_exists: + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=kwargs.get("trust_remote_code", False) + ) + base_model.resize_token_embeddings(len(tokenizer)) + + return cls._target_peft_class.from_pretrained( + base_model, + pretrained_model_name_or_path, + adapter_name=adapter_name, + is_trainable=is_trainable, + config=config, + **kwargs, + ) + + +class AutoPeftModel(_BaseAutoPeftModel): + _target_class = None + _target_peft_class = PeftModel + + +class AutoPeftModelForCausalLM(_BaseAutoPeftModel): + _target_class = AutoModelForCausalLM + _target_peft_class = PeftModelForCausalLM + + +class AutoPeftModelForSeq2SeqLM(_BaseAutoPeftModel): + _target_class = AutoModelForSeq2SeqLM + _target_peft_class = PeftModelForSeq2SeqLM + + +class AutoPeftModelForSequenceClassification(_BaseAutoPeftModel): + _target_class = AutoModelForSequenceClassification + _target_peft_class = PeftModelForSequenceClassification + + +class AutoPeftModelForTokenClassification(_BaseAutoPeftModel): + _target_class = AutoModelForTokenClassification + _target_peft_class = PeftModelForTokenClassification + + +class AutoPeftModelForQuestionAnswering(_BaseAutoPeftModel): + _target_class = AutoModelForQuestionAnswering + _target_peft_class = PeftModelForQuestionAnswering + + +class AutoPeftModelForFeatureExtraction(_BaseAutoPeftModel): + _target_class = AutoModel + _target_peft_class = PeftModelForFeatureExtraction diff --git a/MoRA/peft_mora/config.py b/MoRA/peft_mora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..83eb047f3f62112e02030149e3b4a496cee1555c --- /dev/null +++ b/MoRA/peft_mora/config.py @@ -0,0 +1,270 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import json +import os +from dataclasses import asdict, dataclass, field +from typing import Dict, Optional, Union + +from huggingface_hub import hf_hub_download +from transformers.utils import PushToHubMixin + +from .utils import CONFIG_NAME, PeftType, TaskType + + +@dataclass +class PeftConfigMixin(PushToHubMixin): + r""" + This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all + PEFT adapter models. This class inherits from [`~transformers.utils.PushToHubMixin`] which contains the methods to + push your model to the Hub. 
The method `save_pretrained` will save the configuration of your adapter model in a + directory. The method `from_pretrained` will load the configuration of your adapter model from a directory. + + Args: + peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use. + """ + + peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."}) + auto_mapping: Optional[dict] = field( + default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."} + ) + + def to_dict(self) -> Dict: + r""" + Returns the configuration for your adapter model as a dictionary. + """ + return asdict(self) + + def save_pretrained(self, save_directory: str, **kwargs) -> None: + r""" + This method saves the configuration of your adapter model in a directory. + + Args: + save_directory (`str`): + The directory where the configuration will be saved. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the [`~transformers.utils.PushToHubMixin.push_to_hub`] + method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + auto_mapping_dict = kwargs.pop("auto_mapping_dict", None) + + output_dict = asdict(self) + # converting set type to list + for key, value in output_dict.items(): + if isinstance(value, set): + output_dict[key] = list(value) + + output_path = os.path.join(save_directory, CONFIG_NAME) + + # Add auto mapping details for custom models. + if auto_mapping_dict is not None: + output_dict["auto_mapping"] = auto_mapping_dict + + # save it + with open(output_path, "w") as writer: + writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) + + @classmethod + def from_peft_type(cls, **kwargs): + r""" + This method loads the configuration of your adapter model from a set of kwargs. + + The appropriate configuration type is determined by the `peft_type` argument. If `peft_type` is not provided, + the calling class type is instantiated. + + Args: + kwargs (configuration keyword arguments): + Keyword arguments passed along to the configuration initialization. + """ + # Avoid circular dependency .. TODO: fix this with a larger refactor + from peft_mora.mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + # TODO: this hack is needed to fix the following issue (on commit 702f937): + # if someone saves a default config and loads it back with `PeftConfig` class it yields to + # not loading the correct config class. + + # from peft import AdaLoraConfig, PeftConfig + # peft_config = AdaLoraConfig() + # print(peft_config) + # >>> AdaLoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, + # revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, ... 
+ # + # peft_config.save_pretrained("./test_config") + # peft_config = PeftConfig.from_pretrained("./test_config") + # print(peft_config) + # >>> PeftConfig(peft_type='ADALORA', auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False) + + if "peft_type" in kwargs: + peft_type = kwargs["peft_type"] + config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_type] + else: + config_cls = cls + + return config_cls(**kwargs) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional[str] = None, **kwargs): + r""" + This method loads the configuration of your adapter model from a directory. + + Args: + pretrained_model_name_or_path (`str`): + The directory or the Hub repository id where the configuration is saved. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the child class initialization. + """ + path = ( + os.path.join(pretrained_model_name_or_path, subfolder) + if subfolder is not None + else pretrained_model_name_or_path + ) + + hf_hub_download_kwargs, class_kwargs, _ = cls._split_kwargs(kwargs) + + if os.path.isfile(os.path.join(path, CONFIG_NAME)): + config_file = os.path.join(path, CONFIG_NAME) + else: + try: + config_file = hf_hub_download( + pretrained_model_name_or_path, CONFIG_NAME, subfolder=subfolder, **hf_hub_download_kwargs + ) + except Exception: + raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'") + + loaded_attributes = cls.from_json_file(config_file) + kwargs = {**class_kwargs, **loaded_attributes} + return cls.from_peft_type(**kwargs) + + @classmethod + def from_json_file(cls, path_json_file: str, **kwargs): + r""" + Loads a configuration file from a json file. + + Args: + path_json_file (`str`): + The path to the json file. + """ + with open(path_json_file) as file: + json_object = json.load(file) + + return json_object + + @classmethod + def _split_kwargs(cls, kwargs): + hf_hub_download_kwargs = {} + class_kwargs = {} + other_kwargs = {} + + for key, value in kwargs.items(): + if key in inspect.signature(hf_hub_download).parameters: + hf_hub_download_kwargs[key] = value + elif key in list(cls.__annotations__): + class_kwargs[key] = value + else: + other_kwargs[key] = value + + return hf_hub_download_kwargs, class_kwargs, other_kwargs + + @classmethod + def _get_peft_type( + cls, + model_id: str, + **hf_hub_download_kwargs, + ): + subfolder = hf_hub_download_kwargs.get("subfolder", None) + + path = os.path.join(model_id, subfolder) if subfolder is not None else model_id + + if os.path.isfile(os.path.join(path, CONFIG_NAME)): + config_file = os.path.join(path, CONFIG_NAME) + else: + try: + config_file = hf_hub_download( + model_id, + CONFIG_NAME, + **hf_hub_download_kwargs, + ) + except Exception: + raise ValueError(f"Can't find '{CONFIG_NAME}' at '{model_id}'") + + loaded_attributes = cls.from_json_file(config_file) + return loaded_attributes["peft_type"] + + @property + def is_prompt_learning(self) -> bool: + r""" + Utility method to check if the configuration is for prompt learning. + """ + return False + + @property + def is_adaption_prompt(self) -> bool: + """Return True if this is an adaption prompt config.""" + return False + + +@dataclass +class PeftConfig(PeftConfigMixin): + """ + This is the base configuration class to store the configuration of a [`PeftModel`]. + + Args: + peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use. 
+ task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform. + inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode. + """ + + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name of the base model to use."} + ) + revision: Optional[str] = field(default=None, metadata={"help": "The specific model version to use."}) + peft_type: Optional[Union[str, PeftType]] = field(default=None, metadata={"help": "Peft type"}) + task_type: Optional[Union[str, TaskType]] = field(default=None, metadata={"help": "Task type"}) + inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"}) + + +@dataclass +class PromptLearningConfig(PeftConfig): + """ + This is the base configuration class to store the configuration of [`PrefixTuning`], [`PromptEncoder`], or + [`PromptTuning`]. + + Args: + num_virtual_tokens (`int`): The number of virtual tokens to use. + token_dim (`int`): The hidden embedding dimension of the base transformer model. + num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model. + num_attention_heads (`int`): The number of attention heads in the base transformer model. + num_layers (`int`): The number of layers in the base transformer model. + """ + + num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"}) + token_dim: int = field( + default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"} + ) + num_transformer_submodules: Optional[int] = field( + default=None, metadata={"help": "Number of transformer submodules"} + ) + num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"}) + num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"}) + + @property + def is_prompt_learning(self) -> bool: + r""" + Utility method to check if the configuration is for prompt learning. 
+ """ + return True diff --git a/MoRA/peft_mora/helpers.py b/MoRA/peft_mora/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..8875ff7fc493ae4dfff11a1d8e4485b330cb27dc --- /dev/null +++ b/MoRA/peft_mora/helpers.py @@ -0,0 +1,113 @@ +import inspect +from copy import deepcopy +from functools import update_wrapper +from types import MethodType + +from .peft_model import PeftModel + + +def update_forward_signature(model: PeftModel) -> None: + """ + Args: + Updates the forward signature of the PeftModel to include parents class signature + model (`PeftModel`): Peft model to update the forward signature + Example: + + ```python + >>> from transformers import WhisperForConditionalGeneration + >>> from peft import get_peft_model, LoraConfig, update_forward_signature + + >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + >>> peft_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj", "v_proj"]) + + >>> peft_model = get_peft_model(model, peft_config) + >>> update_forward_signature(peft_model) + ``` + """ + + # Only update signature when the current forward signature only has *args and **kwargs + current_signature = inspect.signature(model.forward) + if ( + len(current_signature.parameters) == 2 + and "args" in current_signature.parameters + and "kwargs" in current_signature.parameters + ): + forward = deepcopy(model.forward.__func__) + update_wrapper( + forward, type(model.get_base_model()).forward, assigned=("__doc__", "__name__", "__annotations__") + ) + model.forward = MethodType(forward, model) + + +def update_generate_signature(model: PeftModel) -> None: + """ + Args: + Updates the generate signature of a PeftModel with overriding generate to include parents class signature + model (`PeftModel`): Peft model to update the generate signature + Example: + + ```python + >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + >>> from peft import get_peft_model, LoraConfig, TaskType, update_generate_signature + + >>> model_name_or_path = "bigscience/mt0-large" + >>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + >>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) + + >>> peft_config = LoraConfig( + ... task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 + ... 
) + >>> peft_model = get_peft_model(model, peft_config) + >>> update_generate_signature(peft_model) + >>> help(peft_model.generate) + ``` + """ + if not hasattr(model, "generate"): + return + current_signature = inspect.signature(model.generate) + if ( + len(current_signature.parameters) == 2 + and "args" in current_signature.parameters + and "kwargs" in current_signature.parameters + ) or (len(current_signature.parameters) == 1 and "kwargs" in current_signature.parameters): + generate = deepcopy(model.generate.__func__) + update_wrapper( + generate, + type(model.get_base_model()).generate, + assigned=("__doc__", "__name__", "__annotations__"), + ) + model.generate = MethodType(generate, model) + + +def update_signature(model: PeftModel, method: str = "all") -> None: + """ + Args: + Updates the signature of a PeftModel include parents class signature for forward or generate method + model (`PeftModel`): Peft model to update generate or forward signature method (`str`): method to update + signature choose one of "forward", "generate", "all" + Example: + ```python + >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + >>> from peft import get_peft_model, LoraConfig, TaskType, update_signature + + >>> model_name_or_path = "bigscience/mt0-large" + >>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + >>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) + + >>> peft_config = LoraConfig( + ... task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 + ... ) + >>> peft_model = get_peft_model(model, peft_config) + >>> update_signature(peft_model) + >>> help(peft_model.generate) + ``` + """ + if method == "forward": + update_forward_signature(model) + elif method == "generate": + update_generate_signature(model) + elif method == "all": + update_forward_signature(model) + update_generate_signature(model) + else: + raise ValueError(f"method {method} is not supported please choose one of ['forward', 'generate', 'all']") diff --git a/MoRA/peft_mora/import_utils.py b/MoRA/peft_mora/import_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6c32d96d52e74bd5de879c06c732fbf82417a8b6 --- /dev/null +++ b/MoRA/peft_mora/import_utils.py @@ -0,0 +1,73 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
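+# Optional-dependency probes: each `is_*_available()` helper below checks whether a
+# backend such as bitsandbytes, auto-gptq, optimum, torch_xla, aqlm, or awq can be
+# imported, so quantization- and TPU-specific code paths can be guarded lazily.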
+import importlib +import importlib.metadata as importlib_metadata +from functools import lru_cache + +import packaging.version + + +def is_bnb_available() -> bool: + return importlib.util.find_spec("bitsandbytes") is not None + + +def is_bnb_4bit_available() -> bool: + if not is_bnb_available(): + return False + + import bitsandbytes as bnb + + return hasattr(bnb.nn, "Linear4bit") + + +def is_auto_gptq_available(): + if importlib.util.find_spec("auto_gptq") is not None: + AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0") + version_autogptq = packaging.version.parse(importlib_metadata.version("auto_gptq")) + if AUTOGPTQ_MINIMUM_VERSION <= version_autogptq: + return True + else: + raise ImportError( + f"Found an incompatible version of auto-gptq. Found version {version_autogptq}, " + f"but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported" + ) + + +def is_optimum_available() -> bool: + return importlib.util.find_spec("optimum") is not None + + +@lru_cache +def is_torch_tpu_available(check_device=True): + "Checks if `torch_xla` is installed and potentially if a TPU is in the environment" + if importlib.util.find_spec("torch_xla") is not None: + if check_device: + # We need to check if `xla_device` can be found, will raise a RuntimeError if not + try: + import torch_xla.core.xla_model as xm + + _ = xm.xla_device() + return True + except RuntimeError: + return False + return True + return False + + +def is_aqlm_available(): + return importlib.util.find_spec("aqlm") is not None + + +def is_auto_awq_available(): + return importlib.util.find_spec("awq") is not None diff --git a/MoRA/peft_mora/mapping.py b/MoRA/peft_mora/mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..b62ddf94aafa1a32b2711c0a6e365900065a93b4 --- /dev/null +++ b/MoRA/peft_mora/mapping.py @@ -0,0 +1,168 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
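+# Registries mapping task types and PEFT types to their model, config, and tuner
+# classes, plus the `get_peft_model` and `inject_adapter_in_model` entry points
+# that wrap a base transformers model with an adapter.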
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import torch + +from .config import PeftConfig +from .mixed_model import PeftMixedModel +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForFeatureExtraction, + PeftModelForQuestionAnswering, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, +) +from .tuners import ( + AdaLoraConfig, + AdaLoraModel, + AdaptionPromptConfig, + IA3Config, + IA3Model, + LoHaConfig, + LoHaModel, + LoKrConfig, + LoKrModel, + LoraConfig, + LoraModel, + MultitaskPromptTuningConfig, + OFTConfig, + OFTModel, + PolyConfig, + PolyModel, + PrefixTuningConfig, + PromptEncoderConfig, + PromptTuningConfig, +) +from .utils import _prepare_prompt_learning_config + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + +MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, PeftModel] = { + "SEQ_CLS": PeftModelForSequenceClassification, + "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, + "CAUSAL_LM": PeftModelForCausalLM, + "TOKEN_CLS": PeftModelForTokenClassification, + "QUESTION_ANS": PeftModelForQuestionAnswering, + "FEATURE_EXTRACTION": PeftModelForFeatureExtraction, +} + +PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, PeftConfig] = { + "ADAPTION_PROMPT": AdaptionPromptConfig, + "PROMPT_TUNING": PromptTuningConfig, + "PREFIX_TUNING": PrefixTuningConfig, + "P_TUNING": PromptEncoderConfig, + "LORA": LoraConfig, + "LOHA": LoHaConfig, + "LOKR": LoKrConfig, + "ADALORA": AdaLoraConfig, + "IA3": IA3Config, + "MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, + "OFT": OFTConfig, + "POLY": PolyConfig, +} + +PEFT_TYPE_TO_TUNER_MAPPING = { + "LORA": LoraModel, + "LOHA": LoHaModel, + "LOKR": LoKrModel, + "ADALORA": AdaLoraModel, + "IA3": IA3Model, + "OFT": OFTModel, + "POLY": PolyModel, +} + + +def get_peft_config(config_dict: dict[str, Any]) -> PeftConfig: + """ + Returns a Peft config object from a dictionary. + + Args: + config_dict (`Dict[str, Any]`): Dictionary containing the configuration parameters. + """ + + return PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]](**config_dict) + + +def get_peft_model( + model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default", mixed: bool = False +) -> PeftModel | PeftMixedModel: + """ + Returns a Peft model object from a model and a config. + + Args: + model ([`transformers.PreTrainedModel`]): + Model to be wrapped. + peft_config ([`PeftConfig`]): + Configuration object containing the parameters of the Peft model. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the adapter to be injected, if not provided, the default adapter name is used ("default"). + mixed (`bool`, `optional`, defaults to `False`): + Whether to allow mixing different (compatible) adapter types. 
+ """ + model_config = getattr(model, "config", {"model_type": "custom"}) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + + peft_config.base_model_name_or_path = model.__dict__.get("name_or_path", None) + + if mixed: + return PeftMixedModel(model, peft_config, adapter_name=adapter_name) + + if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning: + return PeftModel(model, peft_config, adapter_name=adapter_name) + + if peft_config.is_prompt_learning: + peft_config = _prepare_prompt_learning_config(peft_config, model_config) + return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name) + + +def inject_adapter_in_model( + peft_config: PeftConfig, model: torch.nn.Module, adapter_name: str = "default" +) -> torch.nn.Module: + r""" + A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning + methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API + calls `get_peft_model` under the hood but would be restricted only to non-prompt learning methods. + + Args: + peft_config (`PeftConfig`): + Configuration object containing the parameters of the Peft model. + model (`torch.nn.Module`): + The input model where the adapter will be injected. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the adapter to be injected, if not provided, the default adapter name is used ("default"). + """ + if peft_config.is_prompt_learning or peft_config.is_adaption_prompt: + raise ValueError("`create_and_replace` does not support prompt learning and adaption prompt yet.") + + if peft_config.peft_type not in PEFT_TYPE_TO_TUNER_MAPPING.keys(): + raise ValueError( + f"`inject_adapter_in_model` does not support {peft_config.peft_type} yet. Please use `get_peft_model`." + ) + + tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type] + + # By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules. + peft_model = tuner_cls(model, peft_config, adapter_name=adapter_name) + + return peft_model.model diff --git a/MoRA/peft_mora/mixed_model.py b/MoRA/peft_mora/mixed_model.py new file mode 100644 index 0000000000000000000000000000000000000000..bd0fff0e3497924af536bafbc12d05bff3e5fd79 --- /dev/null +++ b/MoRA/peft_mora/mixed_model.py @@ -0,0 +1,402 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from accelerate.hooks import remove_hook_from_submodules +from torch import nn +from transformers.utils import PushToHubMixin + +from peft_mora.tuners.mixed import COMPATIBLE_TUNER_TYPES + +from .config import PeftConfig +from .peft_model import PeftModel +from .tuners import ( + AdaLoraModel, + IA3Model, + LoHaModel, + LoKrModel, + LoraModel, + MixedModel, + OFTModel, +) +from .utils import PeftType, _set_adapter, _set_trainable + + +PEFT_TYPE_TO_MODEL_MAPPING = { + PeftType.LORA: LoraModel, + PeftType.LOHA: LoHaModel, + PeftType.LOKR: LoKrModel, + PeftType.ADALORA: AdaLoraModel, + PeftType.IA3: IA3Model, + PeftType.OFT: OFTModel, +} + + +def _prepare_model_for_gradient_checkpointing(model: nn.Module) -> None: + r""" + Prepares the model for gradient checkpointing if necessary + """ + # Note: same as PeftModel._prepare_model_for_gradient_checkpointing + if not getattr(model, "is_gradient_checkpointing", True): + return model + + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + +def _check_config_compatible(peft_config: PeftConfig) -> None: + if peft_config.peft_type not in COMPATIBLE_TUNER_TYPES: + raise ValueError( + f"The provided `peft_type` '{peft_config.peft_type.value}' is not compatible with the `PeftMixedModel`. " + f"Compatible types are: {COMPATIBLE_TUNER_TYPES}" + ) + + +class PeftMixedModel(PushToHubMixin, torch.nn.Module): + """ + PeftMixedModel for loading mixing different types of adapters for inference. + + This class does not support loading/saving, and it shouldn't usually be initialized directly. Instead, use + `get_peft_model` with the argument `mixed=True`. + + + + Read the [Mixed adapter types](https://huggingface.co/docs/peft/en/developer_guides/mixed_models) guide to learn + more about using different adapter types. + + + + Example: + + ```py + >>> from peft import get_peft_model + + >>> base_model = ... # load the base model, e.g. from transformers + >>> peft_model = PeftMixedModel.from_pretrained(base_model, path_to_adapter1, "adapter1").eval() + >>> peft_model.load_adapter(path_to_adapter2, "adapter2") + >>> peft_model.set_adapter(["adapter1", "adapter2"]) # activate both adapters + >>> peft_model(data) # forward pass using both adapters + ``` + + Args: + model (`torch.nn.Module`): + The model to be tuned. + config (`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the first adapter. 
+ """ + + def __init__(self, model: nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__() + _check_config_compatible(peft_config) + _prepare_model_for_gradient_checkpointing(model) + self.modules_to_save = None + self.base_model = MixedModel(model, {adapter_name: peft_config}, adapter_name) + self.set_modules_to_save(peft_config, adapter_name) + + self.config = getattr(model, "config", {"model_type": "custom"}) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. + if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + return self.base_model.peft_config + + @property + def active_adapter(self) -> str: + return self.base_model.active_adapter + + @property + def active_adapters(self) -> list[str]: + return self.base_model.active_adapters + + def get_nb_trainable_parameters(self): + r""" + Returns the number of trainable parameters and number of all parameters in the model. + """ + # note: same as PeftModel.get_nb_trainable_parameters + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model. + """ + # note: same as PeftModel.print_trainable_parameters + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || " + f"all params: {all_param:,d} || " + f"trainable%: {100 * trainable_params / all_param:.4f}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.base_model, name) + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + return self.base_model(*args, **kwargs) + + def generate(self, *args: Any, **kwargs: Any): + """ + Generate output. + """ + return self.base_model.generate(*args, **kwargs) + + @contextmanager + def disable_adapter(self): + """ + Disables the adapter module. 
+ """ + try: + self.base_model.disable_adapter_layers() + yield + finally: + self.base_model.enable_adapter_layers() + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig): + _check_config_compatible(peft_config) + + try: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter(self, adapter_name) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_modules_to_save(peft_config, adapter_name) + + def set_modules_to_save(self, peft_config: PeftConfig, adapter_name: str) -> None: + if (modules_to_save := getattr(peft_config, "modules_to_save", None)) is None: + return + + if self.modules_to_save is None: + self.modules_to_save = set(modules_to_save) + else: + self.modules_to_save.update(modules_to_save) + _set_trainable(self, adapter_name) + + def set_adapter(self, adapter_name: Union[str, list[str]]) -> None: + """ + Sets the active adapter(s) for the model. + + Note that the order in which the adapters are applied during the forward pass may not be the same as the order + in which they are passed to this function. Instead, the order during the forward pass is determined by the + order in which the adapters were loaded into the model. The active adapters only determine which adapters are + active during the forward pass, but not the order in which they are applied. + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `List[str]`): + The name of the adapter(s) to be activated. + """ + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.set_adapter(adapter_name) + _set_adapter(self, adapter_name) + + def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None: + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.delete_adapter(adapter_name) + + def merge_and_unload(self, *args: Any, **kwargs: Any): + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self.base_model.merge_and_unload(*args, **kwargs) + + def unload(self, *args: Any, **kwargs: Any): + """ + Gets back the base model by removing all the adapter modules without merging. This gives back the original base + model. 
+ """ + return self.base_model.unload(*args, **kwargs) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + return PeftModel._split_kwargs(kwargs) + + def load_adapter(self, model_id: str, adapter_name: str, *args: Any, **kwargs: Any): + output = PeftModel.load_adapter(self, model_id, adapter_name, *args, **kwargs) + # TODO: not quite clear why this is necessary but tests fail without it + self.set_adapter(self.active_adapters) + return output + + def create_or_update_model_card(self, output_dir: str): + raise NotImplementedError(f"Model card creation is not supported for {self.__class__.__name__} (yet).") + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = False, + selected_adapters: Optional[list[str]] = None, + **kwargs: Any, + ): + raise NotImplementedError(f"Saving is not supported for {self.__class__.__name__} (yet).") + + @classmethod + def from_pretrained( + cls, + model: nn.Module, + model_id: str | os.PathLike, + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs: Any, + ): + r""" + Instantiate a PEFT mixed model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model (`nn.Module`): + The model to be adapted. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and use for + inference + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. 
+ """ + # note: adapted from PeftModel.from_pretrained + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + # load the config + if config is None: + config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + subfolder=kwargs.get("subfolder", None), + revision=kwargs.get("revision", None), + cache_dir=kwargs.get("cache_dir", None), + use_auth_token=kwargs.get("use_auth_token", None), + ) + ].from_pretrained(model_id, **kwargs) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + # note: this is different from PeftModel.from_pretrained + if config.peft_type not in PEFT_TYPE_TO_MODEL_MAPPING: + raise ValueError(f"Adapter of type {config.peft_type} is not supported for mixed models.") + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + # note: should not be possible to reach, but just in case + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + + # note: this is different from PeftModel.from_pretrained, we always return a PeftMixedModel + model = cls(model, config, adapter_name) + model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs) + return model diff --git a/MoRA/peft_mora/peft_model.py b/MoRA/peft_mora/peft_model.py new file mode 100644 index 0000000000000000000000000000000000000000..88bfb070e921d5e1def04d4f8a7eaa99f290d617 --- /dev/null +++ b/MoRA/peft_mora/peft_model.py @@ -0,0 +1,1929 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import collections +import inspect +import os +import warnings +from contextlib import contextmanager +from copy import deepcopy +from typing import Any, Optional, Union + +import packaging.version +import torch +import transformers +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules +from accelerate.utils import get_balanced_memory +from huggingface_hub import ModelCard, ModelCardData, hf_hub_download +from safetensors.torch import save_file as safe_save_file +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers import PreTrainedModel +from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput +from transformers.utils import PushToHubMixin + +from . 
import __version__ +from .config import PeftConfig +from .tuners import ( + AdaLoraModel, + AdaptionPromptModel, + IA3Model, + LoHaModel, + LoKrModel, + LoraModel, + MultitaskPromptEmbedding, + OFTModel, + PolyModel, + PrefixEncoder, + PromptEmbedding, + PromptEncoder, +) +from .utils import ( + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + WEIGHTS_NAME, + PeftType, + TaskType, + _get_batch_size, + _prepare_prompt_learning_config, + _set_adapter, + _set_trainable, + get_peft_model_state_dict, + id_tensor_storage, + infer_device, + load_peft_weights, + set_peft_model_state_dict, + shift_tokens_right, +) + + +PEFT_TYPE_TO_MODEL_MAPPING = { + PeftType.LORA: LoraModel, + PeftType.LOHA: LoHaModel, + PeftType.LOKR: LoKrModel, + PeftType.PROMPT_TUNING: PromptEmbedding, + PeftType.P_TUNING: PromptEncoder, + PeftType.PREFIX_TUNING: PrefixEncoder, + PeftType.ADALORA: AdaLoraModel, + PeftType.ADAPTION_PROMPT: AdaptionPromptModel, + PeftType.IA3: IA3Model, + PeftType.OFT: OFTModel, + PeftType.POLY: PolyModel, +} + + +class PeftModel(PushToHubMixin, torch.nn.Module): + """ + Base model encompassing various Peft methods. + + Args: + model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft. + peft_config ([`PeftConfig`]): The configuration of the Peft model. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + + **Attributes**: + - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft. + - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model. + - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when + saving the model. + - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if + using [`PromptLearningConfig`]. + - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if + using [`PromptLearningConfig`]. + - **transformer_backbone_name** (`str`) -- The name of the transformer + backbone in the base model if using [`PromptLearningConfig`]. + - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone + in the base model if using [`PromptLearningConfig`]. + """ + + def __init__(self, model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__() + self.modules_to_save = None + self.active_adapter = adapter_name + self.peft_type = peft_config.peft_type + + self._is_prompt_learning = peft_config.is_prompt_learning + if self._is_prompt_learning: + self._peft_config = {adapter_name: peft_config} + self.base_model = model + self.add_adapter(adapter_name, peft_config) + else: + self._peft_config = None + cls = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type] + self.base_model = cls(model, {adapter_name: peft_config}, adapter_name) + self.set_additional_trainable_modules(peft_config, adapter_name) + + if getattr(model, "is_gradient_checkpointing", True): + model = self._prepare_model_for_gradient_checkpointing(model) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. 
+ if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + if self._is_prompt_learning: + return self._peft_config + return self.base_model.peft_config + + @property + def active_adapters(self) -> list[str]: + try: + adapters = self.base_model.active_adapters + except AttributeError: + adapters = self.active_adapter + if isinstance(adapters, str): + adapters = [adapters] + return adapters + + @peft_config.setter + def peft_config(self, value: dict[str, PeftConfig]): + if self._is_prompt_learning: + self._peft_config = value + else: + self.base_model.peft_config = value + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = True, + selected_adapters: Optional[list[str]] = None, + save_embedding_layers: Union[str, bool] = "auto", + is_main_process: bool = True, + **kwargs: Any, + ) -> None: + r""" + This function saves the adapter model and the adapter configuration files to a directory, so that it can be + reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`] + method. + + Args: + save_directory (`str`): + Directory where the adapter model and configuration files will be saved (will be created if it does not + exist). + safe_serialization (`bool`, *optional*): + Whether to save the adapter files in safetensors format, defaults to `True`. + selected_adapters (`List[str]`, *optional*): + A list of adapters to be saved. If `None`, will default to all adapters. + save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `"auto"`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common + embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. + and automatically sets the boolean flag. This only works for πŸ€— transformers models. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Will default to `True`. Will not save the + checkpoint if not on the main process, which is important for multi device setups (e.g. DDP). + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the `push_to_hub` method. + """ + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + + if selected_adapters is None: + selected_adapters = list(self.peft_config.keys()) + else: + if any( + selected_adapter_name not in list(self.peft_config.keys()) + for selected_adapter_name in selected_adapters + ): + raise ValueError( + f"You passed an invalid `selected_adapters` arguments, current supported adapter names are" + f" {list(self.peft_config.keys())} - got {selected_adapters}." 
+ ) + + if is_main_process: + os.makedirs(save_directory, exist_ok=True) + self.create_or_update_model_card(save_directory) + + for adapter_name in selected_adapters: + peft_config = self.peft_config[adapter_name] + # save only the trainable weights + output_state_dict = get_peft_model_state_dict( + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + save_embedding_layers=save_embedding_layers, + ) + output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory + os.makedirs(output_dir, exist_ok=True) + + if is_main_process and safe_serialization: + # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in output_state_dict.items(): + # Sometimes in the state_dict we have non-tensor objects. + # e.g. in bitsandbytes we have some `str` objects in the state_dict + if isinstance(tensor, torch.Tensor): + ptrs[id_tensor_storage(tensor)].append(name) + else: + # In the non-tensor case, fall back to the pointer of the object itself + ptrs[id(tensor)].append(name) + + # These are all the pointers of shared tensors. + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + + for _, names in shared_ptrs.items(): + # Here we just clone the shared tensors to avoid tensor aliasing which is + # not supported in safetensors. + for shared_tensor_name in names[1:]: + output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone() + + safe_save_file( + output_state_dict, + os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), + metadata={"format": "pt"}, + ) + elif is_main_process: + torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + + # save the config and change the inference mode to `True` + if peft_config.base_model_name_or_path is None: + peft_config.base_model_name_or_path = ( + self.base_model.__dict__.get("name_or_path", None) + if peft_config.is_prompt_learning + else self.base_model.model.__dict__.get("name_or_path", None) + ) + inference_mode = peft_config.inference_mode + peft_config.inference_mode = True + + if peft_config.task_type is None: + # deal with auto mapping + base_model_class = self._get_base_model_class( + is_prompt_tuning=peft_config.is_prompt_learning, + ) + parent_library = base_model_class.__module__ + + auto_mapping_dict = { + "base_model_class": base_model_class.__name__, + "parent_library": parent_library, + } + else: + auto_mapping_dict = None + + if is_main_process: + peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) + peft_config.inference_mode = inference_mode + + @classmethod + def from_pretrained( + cls, + model: torch.nn.Module, + model_id: Union[str, os.PathLike], + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs: Any, + ) -> PeftModel: + r""" + Instantiate a PEFT model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model ([`torch.nn.Module`]): + The model to be adapted. For πŸ€— Transformers models, the model should be initialized with the + [`~transformers.PreTrainedModel.from_pretrained`]. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. 
Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. + """ + from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING + + # load the config + if config is None: + config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + subfolder=kwargs.get("subfolder", None), + revision=kwargs.get("revision", None), + cache_dir=kwargs.get("cache_dir", None), + use_auth_token=kwargs.get("use_auth_token", None), + token=kwargs.get("token", None), + ) + ].from_pretrained(model_id, **kwargs) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + + if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys(): + model = cls(model, config, adapter_name) + else: + model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](model, config, adapter_name) + model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs) + return model + + def _setup_prompt_encoder(self, adapter_name: str): + config = self.peft_config[adapter_name] + if not hasattr(self, "prompt_encoder"): + self.prompt_encoder = torch.nn.ModuleDict({}) + self.prompt_tokens = {} + transformer_backbone = None + for name, module in self.base_model.named_children(): + for param in module.parameters(): + param.requires_grad = False + if isinstance(module, PreTrainedModel): + # Make sure to freeze Tranformers model + if transformer_backbone is None: + transformer_backbone = module + self.transformer_backbone_name = name + if transformer_backbone is None: + transformer_backbone = self.base_model + + if config.num_transformer_submodules is None: + config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 + + for named_param, value in list(transformer_backbone.named_parameters()): + # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape [0] + # the actual unsharded shape is stored in "ds_shape" attribute + # special handling is needed in case the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig + # has been called before + # For reference refer to issue: 
https://github.com/huggingface/peft/issues/996 + deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None) + + if value.shape[0] == self.base_model.config.vocab_size or ( + deepspeed_distributed_tensor_shape is not None + and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size + ): + self.word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", "")) + break + + if config.peft_type == PeftType.PROMPT_TUNING: + prompt_encoder = PromptEmbedding(config, self.word_embeddings) + elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings) + elif config.peft_type == PeftType.P_TUNING: + prompt_encoder = PromptEncoder(config) + elif config.peft_type == PeftType.PREFIX_TUNING: + prompt_encoder = PrefixEncoder(config) + else: + raise ValueError("Not supported") + + prompt_encoder = prompt_encoder.to(self.device) + self.prompt_encoder.update(torch.nn.ModuleDict({adapter_name: prompt_encoder})) + self.prompt_tokens[adapter_name] = torch.arange( + config.num_virtual_tokens * config.num_transformer_submodules + ).long() + + def _prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel): + r""" + Prepares the model for gradient checkpointing if necessary + """ + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + return model + + def get_prompt_embedding_to_save(self, adapter_name: str) -> torch.Tensor: + """ + Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning + method. + """ + prompt_encoder = self.prompt_encoder[adapter_name] + prompt_tokens = ( + self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device) + ) + if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens] + + if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens) + else: + prompt_embeddings = prompt_encoder(prompt_tokens) + + return prompt_embeddings[0].detach().cpu() + + def get_prompt(self, batch_size: int, task_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method. 
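+        For prefix tuning this returns the past key values that are handed to the base model's cache; for prompt
+        tuning and p-tuning it returns the prompt embeddings that are concatenated with the input embeddings.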
+ """ + peft_config = self.active_peft_config + prompt_encoder = self.prompt_encoder[self.active_adapter] + prompt_tokens = ( + self.prompt_tokens[self.active_adapter] + .unsqueeze(0) + .expand(batch_size, -1) + .to(prompt_encoder.embedding.weight.device) + ) + if peft_config.peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens] + if peft_config.inference_mode: + past_key_values = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) + else: + past_key_values = prompt_encoder(prompt_tokens) + if self.base_model_torch_dtype is not None: + past_key_values = past_key_values.to(self.base_model_torch_dtype) + past_key_values = past_key_values.view( + batch_size, + peft_config.num_virtual_tokens, + peft_config.num_layers * 2, + peft_config.num_attention_heads, + peft_config.token_dim // peft_config.num_attention_heads, + ) + if peft_config.num_transformer_submodules == 2: + past_key_values = torch.cat([past_key_values, past_key_values], dim=2) + past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split( + peft_config.num_transformer_submodules * 2 + ) + if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None: + post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type] + past_key_values = post_process_fn(past_key_values) + return past_key_values + else: + if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompts = prompt_encoder(prompt_tokens, task_ids) + else: + if peft_config.inference_mode: + prompts = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) + else: + prompts = prompt_encoder(prompt_tokens) + return prompts + + def get_nb_trainable_parameters(self) -> tuple[int, int]: + r""" + Returns the number of trainable parameters and the number of all parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self) -> None: + """ + Prints the number of trainable parameters in the model. + """ + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.base_model, name) + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + return self.get_base_model()(*args, **kwargs) + + def _get_base_model_class(self, is_prompt_tuning=False): + """ + Returns the base model class. + """ + if not is_prompt_tuning: + return self.base_model.model.__class__ + return self.base_model.__class__ + + @contextmanager + def disable_adapter(self): + """ + Context manager that disables the adapter module. 
Use this to run inference on the base model. + + Example: + + ```py + >>> with model.disable_adapter(): + ... model(inputs) + ``` + """ + try: + if self.peft_config[self.active_adapter].is_prompt_learning: + # TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and + # letting the underlying methods deal with it, same as how LoRA does it. + old_forward = self.forward + self.forward = self.base_model.forward + old_prepare_inputs_for_generation = self.prepare_inputs_for_generation + self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + else: + self.base_model.disable_adapter_layers() + yield + finally: + if self.peft_config[self.active_adapter].is_prompt_learning: + self.forward = old_forward + self.prepare_inputs_for_generation = old_prepare_inputs_for_generation + else: + self.base_model.enable_adapter_layers() + + def get_base_model(self) -> torch.nn.Module: + """ + Returns the base model. + """ + return ( + self.base_model + if (self.active_peft_config.is_prompt_learning or self.peft_type == PeftType.POLY) + else self.base_model.model + ) + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig) -> None: + """ + Add an adapter to the model based on the passed configuration. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + """ + if peft_config.peft_type != self.peft_type: + raise ValueError( + f"Cannot combine adapters with different peft types. " + f"Found {self.peft_type} and {peft_config.peft_type}." + ) + + try: + if peft_config.is_prompt_learning: + self.peft_config[adapter_name] = peft_config + if hasattr(self.config, "to_dict"): + dict_config = self.config.to_dict() + else: + dict_config = self.config + + peft_config = _prepare_prompt_learning_config(peft_config, dict_config) + self._setup_prompt_encoder(adapter_name) + elif peft_config.is_adaption_prompt: + self.base_model.add_adapter(adapter_name, peft_config) + else: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter(self.base_model.model, adapter_name) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_additional_trainable_modules(peft_config, adapter_name) + + def set_additional_trainable_modules(self, peft_config, adapter_name): + if getattr(peft_config, "modules_to_save", None) is not None: + if self.modules_to_save is None: + self.modules_to_save = set(peft_config.modules_to_save) + else: + self.modules_to_save.update(peft_config.modules_to_save) + _set_trainable(self, adapter_name) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + _kwargs_not_in_hf_hub_download_signature = ("use_auth_token",) + hf_hub_download_kwargs = {} + other_kwargs = {} + + for key, value in kwargs.items(): + if key in inspect.signature(hf_hub_download).parameters or key in _kwargs_not_in_hf_hub_download_signature: + hf_hub_download_kwargs[key] = value + else: + other_kwargs[key] = value + + return hf_hub_download_kwargs, other_kwargs + + def load_adapter(self, model_id: str, adapter_name: str, is_trainable: bool = False, **kwargs: Any): + """ + Load a trained adapter into the model. + + The name for the new adapter should be unique. 
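+
+        Unlike [`PeftModel.from_pretrained`], which creates the PEFT model and then loads the weights, this method
+        loads an additional adapter (its configuration and weights) into a PEFT model that already exists.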
+ + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + kwargs: (`optional`): + Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub. + """ + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + hf_hub_download_kwargs, kwargs = self._split_kwargs(kwargs) + torch_device = infer_device() + + if adapter_name not in self.peft_config: + # load the config + peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + **hf_hub_download_kwargs, + ) + ].from_pretrained( + model_id, + **hf_hub_download_kwargs, + ) + if peft_config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + peft_config.inference_mode = not is_trainable + self.add_adapter(adapter_name, peft_config) + + adapters_weights = load_peft_weights(model_id, device=torch_device, **hf_hub_download_kwargs) + + # load the weights into the model + load_result = set_peft_model_state_dict(self, adapters_weights, adapter_name=adapter_name) + if ( + (getattr(self, "hf_device_map", None) is not None) + and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0) + and len(self.peft_config) == 1 + ): + device_map = kwargs.get("device_map", "auto") + max_memory = kwargs.get("max_memory", None) + offload_dir = kwargs.get("offload_folder", None) + offload_index = kwargs.get("offload_index", None) + + dispatch_model_kwargs = {} + # Safety checker for previous `accelerate` versions + # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/ + if "offload_index" in inspect.signature(dispatch_model).parameters: + dispatch_model_kwargs["offload_index"] = offload_index + + no_split_module_classes = self._no_split_modules + + if device_map != "sequential": + max_memory = get_balanced_memory( + self, + max_memory=max_memory, + no_split_module_classes=no_split_module_classes, + low_zero=(device_map == "balanced_low_0"), + ) + if isinstance(device_map, str): + device_map = infer_auto_device_map( + self, max_memory=max_memory, no_split_module_classes=no_split_module_classes + ) + dispatch_model( + self, + device_map=device_map, + offload_dir=offload_dir, + **dispatch_model_kwargs, + ) + hook = AlignDevicesHook(io_same_device=True) + if self.peft_config[adapter_name].is_prompt_learning: + remove_hook_from_submodules(self.prompt_encoder) + add_hook_to_module(self.get_base_model(), hook) + + # Set model in evaluation mode to deactivate Dropout modules by default + if not is_trainable: + self.eval() + return load_result + + def set_adapter(self, adapter_name: str) -> None: + """ + Sets the active adapter. + + Only one adapter can be active at a time. + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str`): + The name of the adapter to be set as active. 
The adapter must be loaded first. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} not found.") + self.active_adapter = adapter_name + if not self.peft_config[adapter_name].is_prompt_learning: + self.base_model.set_adapter(adapter_name) + _set_adapter(self, adapter_name) + + @property + def base_model_torch_dtype(self): + return getattr(self.base_model, "dtype", None) + + @property + def active_peft_config(self): + return self.peft_config[self.active_adapter] + + def create_or_update_model_card(self, output_dir: str): + """ + Updates or create model card to include information about peft: + 1. Adds `peft` library tag + 2. Adds peft version + 3. Adds base model info + 4. Adds quantization information if it was used + """ + + filename = os.path.join(output_dir, "README.md") + + card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData()) + + card.data["library_name"] = "peft" + + model_config = getattr(self, "config", None) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + if model_config is not None and "_name_or_path" in model_config: + card.data["base_model"] = model_config["_name_or_path"] + + lines = card.text.splitlines() + + quantization_config = None + if hasattr(model_config, "quantization_config"): + quantization_config = self.config.quantization_config.to_dict() + training_config_text = "" + quantization_prefix = "The following `bitsandbytes` quantization config was used during training:" + # Adds quantization information if it was used + if quantization_config is not None: + training_config_text += f"\n{quantization_prefix}\n" + training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()]) + training_config_text += "\n" + + training_procedure_heading = "## Training procedure" + if quantization_prefix not in lines and bool(training_config_text): + if training_procedure_heading in lines: + lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) + else: + lines.append(f"{training_procedure_heading}\n{training_config_text}") + + # Adds peft version + framework_block_heading = "### Framework versions" + if f"- PEFT {__version__}" not in lines: + if framework_block_heading in lines: + lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}") + else: + lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}") + + card.text = "\n".join(lines) + card.save(filename) + + +class PeftModelForSequenceClassification(PeftModel): + """ + Peft model for sequence classification tasks. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. + + Example: + + ```py + >>> from transformers import AutoModelForSequenceClassification + >>> from peft import PeftModelForSequenceClassification, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "SEQ_CLS", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 768, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 12, + ... "num_layers": 12, + ... "encoder_hidden_size": 768, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForSequenceClassification(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + if self.modules_to_save is None: + self.modules_to_save = {"classifier", "score"} + else: + self.modules_to_save.update({"classifier", "score"}) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable + _set_trainable(self, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(labels=labels, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + pooled_output = outputs[1] if len(outputs) > 1 else outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + pooled_output = self.base_model.dropout(pooled_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.base_model.num_labels == 1: + self.config.problem_type = "regression" + elif self.base_model.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.base_model.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.base_model.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + 
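The task-specific subclasses in this file each carry a docstring example for constructing the model; for the base-class methods defined earlier (`save_pretrained`, `from_pretrained`, `load_adapter`, `set_adapter`, `disable_adapter`), a minimal end-to-end sketch of the adapter lifecycle is given here. The base model name, adapter paths, and adapter names are placeholder values, not part of the MoRA examples.

``` python
# Sketch of the adapter lifecycle on top of the PeftModel API defined above.
# "facebook/opt-125m", the local paths and the adapter names are placeholders.
import torch
from transformers import AutoModelForCausalLM
from peft_mora import LoraConfig, PeftModel, TaskType, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, target_modules=["q_proj", "v_proj"])

peft_model = get_peft_model(base, config)   # wraps the base model in PeftModelForCausalLM
peft_model.print_trainable_parameters()

# ... training loop ...

peft_model.save_pretrained("./adapter-a")   # writes adapter weights + adapter_config.json only

# Reload onto a fresh base model, attach a second adapter and switch between them.
base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
peft_model = PeftModel.from_pretrained(base, "./adapter-a", adapter_name="a")
peft_model.load_adapter("./adapter-b", adapter_name="b")
peft_model.set_adapter("b")                 # only one adapter is active at a time

with peft_model.disable_adapter():          # run the unmodified base model
    _ = peft_model(input_ids=torch.tensor([[1, 2, 3]]))
```

Note that `save_pretrained` stores only the adapter, so the base model has to be re-instantiated separately before `from_pretrained` is called, as its signature above requires.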
+class PeftModelForCausalLM(PeftModel): + """ + Peft model for causal language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModelForCausalLM, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "CAUSAL_LM", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 1280, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 20, + ... "num_layers": 36, + ... "encoder_hidden_size": 1280, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large") + >>> peft_model = PeftModelForCausalLM(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if self.base_model.config.model_type == "mpt": + if inputs_embeds is not None: + raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size) + return self.base_model( + input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values=past_key_values, **kwargs + ) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + # concat prompt labels + if labels is not None: + prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device) + kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def generate(self, *args, **kwargs): + self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation + if hasattr(self.base_model, "model"): + self.base_model.model.generation_config = self.generation_config + else: + self.base_model.generation_config = self.generation_config + try: + outputs = self.base_model.generate(*args, **kwargs) + except: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + raise + else: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + return outputs + + def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor] = None, **kwargs): + peft_config = self.active_peft_config + model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) + + # https://github.com/huggingface/transformers/pull/26681/ introduced new cache format + # for some architectures which requires a special fix for prompt tuning etc. + # TODO: starting with transformers 4.38, all architectures should support caching. + uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0") + uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0") + transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"] + uses_cache = uses_transformers_4_38 or ( + uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs + ) + + if peft_config.peft_type == PeftType.POLY: + model_kwargs["task_ids"] = task_ids + if peft_config.is_prompt_learning: + if uses_cache and (model_kwargs["past_key_values"] is not None): + # change in the logic of `prepare_inputs_for_generation` makes the below code necessary + # In prompt learning methods, past key values are longer when compared to the `input_ids`. + # As such only consider the last input ids in the autogressive generation phase. 
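+                # (when the cached key/value length already covers the current `input_ids`, keep only the newest token id)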
+ if model_kwargs["past_key_values"][0][0].shape[-2] >= model_kwargs["input_ids"].shape[1]: + model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] + + if model_kwargs.get("attention_mask", None) is not None: + size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens + prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device) + model_kwargs["attention_mask"] = torch.cat( + (prefix_attention_mask, model_kwargs["attention_mask"]), dim=1 + ) + + if model_kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + model_kwargs["position_ids"] = None + + if kwargs.get("token_type_ids", None) is not None: + warnings.warn( + "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" + ) + kwargs["token_type_ids"] = None + + if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) + model_kwargs["past_key_values"] = past_key_values + else: + if model_kwargs["past_key_values"] is None: + inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) + prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1) + model_kwargs["input_ids"] = None + + # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is + # passed in the forward pass to keep track of the position ids of the cache. We have to + # pop that from `model_kwargs` as `cache_position` is properly created by the model, using the passed + # `inputs_embeds`: https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956 + _ = model_kwargs.pop("cache_position", None) + + return model_kwargs + + +class PeftModelForSeq2SeqLM(PeftModel): + """ + Peft model for sequence-to-sequence language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import PeftModelForSeq2SeqLM, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "SEQ_2_SEQ_LM", + ... "inference_mode": False, + ... "r": 8, + ... "target_modules": ["q", "v"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.1, + ... "fan_in_fan_out": False, + ... "enable_lora": None, + ... "bias": "none", + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> peft_model = PeftModelForSeq2SeqLM(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 884736 || all params: 223843584 || trainable%: 0.3952474242013566 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + self.base_model_prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model._prepare_encoder_decoder_kwargs_for_generation + ) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_inputs_embeds=decoder_inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if decoder_attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + decoder_attention_mask.device + ) + if peft_config.peft_type not in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: + decoder_attention_mask = torch.cat((prefix_attention_mask, decoder_attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size) + return self.base_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + **kwargs, + ) + elif peft_config.peft_type in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + attention_mask.device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + + return self.base_model( + inputs_embeds=inputs_embeds, + decoder_input_ids=decoder_input_ids, + decoder_inputs_embeds=decoder_inputs_embeds, + **kwargs, + ) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if decoder_inputs_embeds is None and decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + decoder_inputs_embeds = self.word_embeddings(decoder_input_ids) + + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + attention_mask.device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1) + # concat prompt labels + if labels is not None: + if peft_config.num_transformer_submodules == 1: + kwargs["labels"] = labels + elif peft_config.num_transformer_submodules == 2: + prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device) + kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + if peft_config.num_transformer_submodules == 1: + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + elif peft_config.num_transformer_submodules == 2: + decoder_inputs_embeds = torch.cat( + (prompts[:, peft_config.num_virtual_tokens :], decoder_inputs_embeds), dim=1 + ) + return self.base_model( + inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, **kwargs + ) + + def generate(self, **kwargs): + peft_config = self.active_peft_config + self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation + self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( + self._prepare_encoder_decoder_kwargs_for_generation + ) + try: + if not peft_config.is_prompt_learning: + outputs = self.base_model.generate(**kwargs) + else: + if "input_ids" not in kwargs: + raise ValueError("input_ids must be provided for Peft model generation") + if kwargs.get("position_ids", None) is not None: + warnings.warn( + "Position ids are not supported for 
parameter efficient tuning. Ignoring position ids." + ) + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn( + "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" + ) + kwargs["token_type_ids"] = None + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + outputs = self.base_model.generate(**kwargs) + elif peft_config.peft_type in [ + PeftType.PROMPT_TUNING, + PeftType.P_TUNING, + PeftType.MULTITASK_PROMPT_TUNING, + ]: + kwargs = deepcopy(kwargs) + + if "encoder_outputs" in kwargs: + del kwargs["encoder_outputs"] + warnings.warn( + "`encoder_outputs` should not be passed to `generate` when using prompt tuning. Ignoring it." + ) + + input_ids = kwargs.pop("input_ids") + inputs_embeds = self.word_embeddings(input_ids) + batch_size = inputs_embeds.shape[0] + prompts = self.get_prompt(batch_size=batch_size, task_ids=kwargs.pop("task_ids", None)) + prompts = prompts.to(inputs_embeds.dtype) + + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + kwargs["inputs_embeds"] = inputs_embeds + + if "attention_mask" in kwargs: + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + kwargs["attention_mask"].device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, kwargs["attention_mask"]), dim=1) + + return self.base_model.generate(**kwargs) + else: + raise NotImplementedError + except: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model_prepare_encoder_decoder_kwargs_for_generation + ) + raise + else: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model_prepare_encoder_decoder_kwargs_for_generation + ) + return outputs + + def prepare_inputs_for_generation(self, *args, task_ids: torch.Tensor = None, **kwargs): + peft_config = self.active_peft_config + model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) + if peft_config.peft_type == PeftType.POLY: + model_kwargs["task_ids"] = task_ids + if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: + batch_size = model_kwargs["decoder_input_ids"].shape[0] + past_key_values = self.get_prompt(batch_size) + model_kwargs["past_key_values"] = past_key_values + + return model_kwargs + + +class PeftModelForTokenClassification(PeftModel): + """ + Peft model for token classification tasks. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. + + Example: + + ```py + >>> from transformers import AutoModelForSequenceClassification + >>> from peft import PeftModelForTokenClassification, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "TOKEN_CLS", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 768, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 12, + ... "num_layers": 12, + ... "encoder_hidden_size": 768, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForTokenClassification.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForTokenClassification(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig = None, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + if self.modules_to_save is None: + self.modules_to_save = {"classifier", "score"} + else: + self.modules_to_save.update({"classifier", "score"}) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable + _set_trainable(self, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(labels=labels, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + sequence_output = outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + sequence_output = self.base_model.dropout(sequence_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForQuestionAnswering(PeftModel): + """ + Peft model for extractive question answering. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. + + Example: + + ```py + >>> from transformers import AutoModelForQuestionAnswering + >>> from peft import PeftModelForQuestionAnswering, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "QUESTION_ANS", + ... "inference_mode": False, + ... "r": 16, + ... "target_modules": ["query", "value"], + ... "lora_alpha": 32, + ... 
"lora_dropout": 0.05, + ... "fan_in_fan_out": False, + ... "bias": "none", + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForQuestionAnswering(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 592900 || all params: 108312580 || trainable%: 0.5473971721475013 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + if self.modules_to_save is None: + self.modules_to_save = {"qa_outputs"} + else: + self.modules_to_save.update({"qa_outputs"}) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable + _set_trainable(self, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + start_positions=start_positions, + end_positions=end_positions, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "start_positions": start_positions, + "end_positions": end_positions, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(start_positions=start_positions, end_positions=end_positions, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + sequence_output = outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + sequence_output = self.base_model.dropout(sequence_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return 
QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForFeatureExtraction(PeftModel): + """ + Peft model for extracting features/embeddings from transformer models + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + + Example: + + ```py + >>> from transformers import AutoModel + >>> from peft import PeftModelForFeatureExtraction, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "FEATURE_EXTRACTION", + ... "inference_mode": False, + ... "r": 16, + ... "target_modules": ["query", "value"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.05, + ... "fan_in_fan_out": False, + ... "bias": "none", + ... } + >>> peft_config = get_peft_config(config) + >>> model = AutoModel.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForFeatureExtraction(model, peft_config) + >>> peft_model.print_trainable_parameters() + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default"): + super().__init__(model, peft_config, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size) + return self.base_model(input_ids=input_ids, past_key_values=past_key_values, **kwargs) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) diff --git a/MoRA/peft_mora/py.typed b/MoRA/peft_mora/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MoRA/peft_mora/tuners/__init__.py b/MoRA/peft_mora/tuners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b47baa668177ec80b3ec142f1555c5b90f13dcca --- /dev/null +++ b/MoRA/peft_mora/tuners/__init__.py @@ -0,0 +1,32 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel +from .lora import LoraConfig, LoraModel, LoftQConfig +from .loha import LoHaConfig, LoHaModel +from .lokr import LoKrConfig, LoKrModel +from .ia3 import IA3Config, IA3Model +from .adalora import AdaLoraConfig, AdaLoraModel +from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType +from .prefix_tuning import PrefixEncoder, PrefixTuningConfig +from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit +from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .oft import OFTConfig, OFTModel +from .mixed import MixedModel +from .poly import PolyConfig, PolyModel diff --git a/MoRA/peft_mora/tuners/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29d5d895e7fa6671b6cd6268273a65d5b2e69f6a Binary files /dev/null and b/MoRA/peft_mora/tuners/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/__pycache__/lycoris_utils.cpython-312.pyc b/MoRA/peft_mora/tuners/__pycache__/lycoris_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fceaf8c5fbd020a8a698de4100be119633135e6 Binary files /dev/null and b/MoRA/peft_mora/tuners/__pycache__/lycoris_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/__pycache__/tuners_utils.cpython-312.pyc b/MoRA/peft_mora/tuners/__pycache__/tuners_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16c1d89adf0b2513d2fd824ac4eefef8d4a185f7 Binary files /dev/null and b/MoRA/peft_mora/tuners/__pycache__/tuners_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__init__.py b/MoRA/peft_mora/tuners/adalora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf041fb128477e5c1906e4f82b37b3390e4b46b2 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
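+# NOTE: the bitsandbytes-backed layers (SVDLinear8bitLt and SVDLinear4bit) are intentionally
+# not imported at module load time; they are resolved lazily through the module-level
+# __getattr__ defined below, so this package can be imported without bitsandbytes installed.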
+ +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .config import AdaLoraConfig +from .gptq import SVDQuantLinear +from .layer import AdaLoraLayer, RankAllocator, SVDLinear +from .model import AdaLoraModel + + +__all__ = ["AdaLoraConfig", "AdaLoraLayer", "AdaLoraModel", "SVDLinear", "RankAllocator", "SVDQuantLinear"] + + +def __getattr__(name): + if (name == "SVDLinear8bitLt") and is_bnb_available(): + from .bnb import SVDLinear8bitLt + + return SVDLinear8bitLt + + if (name == "SVDLinear4bit") and is_bnb_4bit_available(): + from .bnb import SVDLinear4bit + + return SVDLinear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5372403ad2dd6ce319a9c592f59a67242c13d708 Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b419d8b5c67544c7dececa1af90d9d1180cad14 Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/gptq.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/gptq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6080380ad77ad89feaf6a09aa462e511fe81cded Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/gptq.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed4c21ce63102e8836e7032b378704dffcc6b22d Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37aa099ab94a7abc2bd1e9d7e32ebf8953afa16f Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/bnb.py b/MoRA/peft_mora/tuners/adalora/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..59ef4ae90863d4d8e708195628e88e3d8659fec2 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/bnb.py @@ -0,0 +1,145 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
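+# bitsandbytes-quantized AdaLoRA layers. Each class is only defined when the corresponding
+# bitsandbytes feature is importable (SVDLinear8bitLt behind is_bnb_available(),
+# SVDLinear4bit behind is_bnb_4bit_available()). Both wrap a quantized base linear layer and
+# add the SVD-parameterized update dropout(x) @ (lora_A * lora_E).T @ lora_B.T, scaled by
+# scaling / ranknum, on top of the quantized forward result.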
+ +from typing import Any + +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .layer import AdaLoraLayer + + +if is_bnb_available(): + + class SVDLinear8bitLt(torch.nn.Module, AdaLoraLayer): + # Low-rank matrix for SVD-based adaptation + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + result = self.base_layer(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + if x.dtype != torch.float32: + x = x.float() + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling / ranknum + # inplace operation on view is forbidden for MatMul8bitLtBackward, so avoid it + result = result + output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep + + +if is_bnb_4bit_available(): + + class SVDLinear4bit(torch.nn.Module, AdaLoraLayer): + # Low-rank matrix for SVD-based adaptation + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + result = self.base_layer(x, *args, **kwargs) + + if self.disable_adapters: + return result + + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. 
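+            # (After the clone, the loop below adds each active adapter's contribution,
+            # computed in the adapter's compute dtype and cast back to the result dtype
+            # when autocast is not enabled.)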
+ result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + compute_dtype = lora_A.dtype + if x.dtype != compute_dtype: + x = x.to(compute_dtype) + + output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling / ranknum + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep diff --git a/MoRA/peft_mora/tuners/adalora/config.py b/MoRA/peft_mora/tuners/adalora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d4595300a8ad41be2fc947580a6bd85459ed3259 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/config.py @@ -0,0 +1,52 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional + +from peft_mora.tuners.lora import LoraConfig +from peft_mora.utils import PeftType + + +@dataclass +class AdaLoraConfig(LoraConfig): + """ + This is the configuration class to store the configuration of a [`~peft.AdaLora`]. + + Args: + target_r (`int`): The target average rank of incremental matrix. + init_r (`int`): The initial rank for each incremental matrix. + tinit (`int`): The steps of initial fine-tuning warmup. + tfinal (`int`): The step of final fine-tuning. + deltaT (`int`): The time internval between two budget allocations. + beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing. + beta2 (`float`): The hyperparameter of EMA for undertainty quantification. + orth_reg_weight (`float`): The coefficient of orthogonal regularization. + total_step (`int`): The total training steps that should be specified before training. + rank_pattern (`list`): The allocated rank for each weight matrix by RankAllocator. 
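+
+    Example (illustrative only; the rank and schedule values below are placeholders, not
+    recommendations, and should be chosen to match your own training run):
+
+    ```py
+    >>> from peft import AdaLoraConfig
+
+    >>> config = AdaLoraConfig(
+    ...     init_r=12,
+    ...     target_r=8,
+    ...     tinit=200,
+    ...     tfinal=500,
+    ...     deltaT=10,
+    ...     total_step=3000,
+    ...     lora_alpha=32,
+    ...     lora_dropout=0.01,
+    ...     target_modules=["q", "v"],
+    ...     task_type="SEQ_2_SEQ_LM",
+    ... )
+    ```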
+ """ + + target_r: int = field(default=8, metadata={"help": "Target Lora matrix dimension."}) + init_r: int = field(default=12, metadata={"help": "Initial Lora matrix dimension."}) + tinit: int = field(default=0, metadata={"help": "The steps of initial warmup."}) + tfinal: int = field(default=0, metadata={"help": "The steps of final warmup."}) + deltaT: int = field(default=1, metadata={"help": "Step interval of rank allocation."}) + beta1: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."}) + beta2: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."}) + orth_reg_weight: float = field(default=0.5, metadata={"help": "The orthogonal regularization coefficient."}) + total_step: Optional[int] = field(default=None, metadata={"help": "The total training steps."}) + rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."}) + + def __post_init__(self): + self.peft_type = PeftType.ADALORA diff --git a/MoRA/peft_mora/tuners/adalora/gptq.py b/MoRA/peft_mora/tuners/adalora/gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..910377c5db5908727ed4753fd15b24e68821ce00 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/gptq.py @@ -0,0 +1,72 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from .layer import AdaLoraLayer + + +class SVDQuantLinear(torch.nn.Module, AdaLoraLayer): + def __init__( + self, + base_layer, + adapter_name, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + result = self.quant_linear_module(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + if x.dtype != torch.float32: + x = x.float() + + output = (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum + # TODO: here, the dtype conversion is applied on the *whole expression*, + # not the intermediate result, unlike for SVDLinear8bitLT and + # SVDLinear4bit, is that correct? 
+ if requires_conversion: + output = output.to(expected_dtype) + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep diff --git a/MoRA/peft_mora/tuners/adalora/layer.py b/MoRA/peft_mora/tuners/adalora/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6835e9fbf67dae4fb751d1968b0a99387323bc --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/layer.py @@ -0,0 +1,346 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Any, List, Optional + +import torch +from torch import nn + +from peft_mora.tuners.lora import LoraLayer +from peft_mora.tuners.tuners_utils import check_adapters_to_merge +from peft_mora.utils import transpose + + +class AdaLoraLayer(LoraLayer): + # List all names of layers that may contain adapter weights + # Note: ranknum doesn't need to be included as it is not an nn.Module + adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B") + # other_param_names is defined in LoraLayer + + def __init__(self, base_layer: nn.Module) -> None: + super().__init__(base_layer) + self.lora_E = nn.ParameterDict({}) + self.lora_A = nn.ParameterDict({}) + self.lora_B = nn.ParameterDict({}) + self.ranknum = nn.ParameterDict({}) + + def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + # Right singular vectors + self.lora_A[adapter_name] = nn.Parameter(torch.randn(r, self.in_features)) + # Singular values + self.lora_E[adapter_name] = nn.Parameter(torch.randn(r, 1)) + # Left singular vectors + self.lora_B[adapter_name] = nn.Parameter(torch.randn(self.out_features, r)) + # The current rank + self.ranknum[adapter_name] = nn.Parameter(torch.randn(1), requires_grad=False) + self.ranknum[adapter_name].data.fill_(float(r)) + self.ranknum[adapter_name].requires_grad = False + self.scaling[adapter_name] = lora_alpha if lora_alpha > 0 else float(r) + if init_lora_weights: + self.reset_lora_parameters(adapter_name) + + if hasattr(self.get_base_layer(), "qweight"): + # QuantLinear + self.to(self.get_base_layer().qweight.device) + else: + self.to(self.get_base_layer().weight.device) + self.set_adapter(self.active_adapters) + + def reset_lora_parameters(self, adapter_name): + if adapter_name in self.lora_A.keys(): + nn.init.normal_(self.lora_E[adapter_name], mean=0.0, std=0.02) + nn.init.normal_(self.lora_A[adapter_name], mean=0.0, std=0.02) + nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02) + + +class SVDLinear(nn.Module, AdaLoraLayer): + # SVD-based adaptation by a dense 
layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + base_layer = self.get_base_layer() + if active_adapter in self.lora_A.keys(): + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + return ( + transpose(self.lora_B[adapter] @ (self.lora_A[adapter] * self.lora_E[adapter]), self.fan_in_fan_out) + * self.scaling[adapter] + / (self.ranknum[adapter] + 1e-5) + ) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + x = x.to(lora_A.dtype) + result += (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep + + +class RankAllocator: + """ + The RankAllocator for AdaLoraModel. 
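+    During training it moves rank budget between SVD triplets (each row of lora_E together with
+    the matching row of lora_A and column of lora_B): per-parameter importance is smoothed with
+    EMAs controlled by beta1/beta2, the total budget follows a cubic schedule from the initial
+    budget down to an average of target_r per targeted module, and the least important lora_E
+    entries are masked to zero.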
Paper: https://openreview.net/pdf?id=lq62uWRJjiY + + Args: + config ([`AdaLoraConfig`]): The configuration of the AdaLora model. + model: the model that we apply AdaLoRA to. + + """ + + def __init__(self, model, peft_config, adapter_name): + self.peft_config = peft_config + self.adapter_name = adapter_name + self.beta1 = peft_config.beta1 + self.beta2 = peft_config.beta2 + assert self.beta1 > 0 and self.beta1 < 1 + assert self.beta2 > 0 and self.beta2 < 1 + + self.reset_ipt() + self._set_budget_scheduler(model) + + def set_total_step(self, total_step): + self.peft_config.total_step = total_step + + def reset_ipt(self): + self.ipt = {} + self.exp_avg_ipt = {} + self.exp_avg_unc = {} + + def _set_budget_scheduler(self, model): + self.init_bgt = 0 + self.name_set = set() + for n, p in model.named_parameters(): + if f"lora_A.{self.adapter_name}" in n: + self.init_bgt += p.size(0) + self.name_set.add(n.replace("lora_A", "%s")) + self.name_set = sorted(self.name_set) + # The total final rank budget + self.target_bgt = self.peft_config.target_r * len(self.name_set) + + def budget_schedule(self, step: int): + tinit = self.peft_config.tinit + tfinal = self.peft_config.tfinal + total_step = self.peft_config.total_step + # Initial warmup + if step <= tinit: + budget = self.init_bgt + mask_ind = False + # Final fine-tuning + elif step > total_step - tfinal: + budget = self.target_bgt + mask_ind = True + else: + # Budget decreasing with a cubic scheduler + mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit) + budget = int((self.init_bgt - self.target_bgt) * (mul_coeff**3) + self.target_bgt) + mask_ind = True if step % self.peft_config.deltaT == 0 else False + return budget, mask_ind + + def update_ipt(self, model): + # Update the sensitivity and uncertainty for every weight + for n, p in model.named_parameters(): + if "lora_" in n and self.adapter_name in n: + if n not in self.ipt: + self.ipt[n] = torch.zeros_like(p) + self.exp_avg_ipt[n] = torch.zeros_like(p) + self.exp_avg_unc[n] = torch.zeros_like(p) + with torch.no_grad(): + self.ipt[n] = (p * p.grad).abs().detach() + # Sensitivity smoothing + self.exp_avg_ipt[n] = self.beta1 * self.exp_avg_ipt[n] + (1 - self.beta1) * self.ipt[n] + # Uncertainty quantification + self.exp_avg_unc[n] = ( + self.beta2 * self.exp_avg_unc[n] + (1 - self.beta2) * (self.ipt[n] - self.exp_avg_ipt[n]).abs() + ) + + def _element_score(self, n): + return self.exp_avg_ipt[n] * self.exp_avg_unc[n] + + def _combine_ipt(self, ipt_E, ipt_AB): + ipt_AB = ipt_AB.sum(dim=1, keepdim=False) + sum_ipt = ipt_E.view(-1) + ipt_AB.view(-1) + return sum_ipt + + def mask_to_budget(self, model, budget): + value_ipt = {} + vector_ipt = {} + triplet_ipt = {} + # Get the importance score for A, E, B + for n, p in model.named_parameters(): + if f"lora_A.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + comb_ipt = torch.mean(entry_ipt, dim=1, keepdim=True) + name_m = n.replace("lora_A", "%s") + if name_m not in vector_ipt: + vector_ipt[name_m] = [comb_ipt] + else: + vector_ipt[name_m].append(comb_ipt) + if f"lora_B.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + comb_ipt = torch.mean(entry_ipt, dim=0, keepdim=False).view(-1, 1) + name_m = n.replace("lora_B", "%s") + if name_m not in vector_ipt: + vector_ipt[name_m] = [comb_ipt] + else: + vector_ipt[name_m].append(comb_ipt) + if f"lora_E.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + name_m = n.replace("lora_E", "%s") + value_ipt[name_m] = entry_ipt + + all_score = [] + # Calculate the 
score for each triplet + for name_m in vector_ipt: + ipt_E = value_ipt[name_m] + ipt_AB = torch.cat(vector_ipt[name_m], dim=1) + sum_ipt = self._combine_ipt(ipt_E, ipt_AB) + name_E = name_m % "lora_E" + triplet_ipt[name_E] = sum_ipt.view(-1, 1) + all_score.append(sum_ipt.view(-1)) + + # Get the threshold by ranking ipt + mask_threshold = torch.kthvalue( + torch.cat(all_score), + k=self.init_bgt - budget, + )[0].item() + + rank_pattern = {} + # Mask the unimportant triplets + with torch.no_grad(): + for n, p in model.named_parameters(): + if f"lora_E.{self.adapter_name}" in n: + p.masked_fill_(triplet_ipt[n] <= mask_threshold, 0.0) + rank_pattern[n] = (~(triplet_ipt[n] <= mask_threshold)).view(-1).tolist() + return rank_pattern + + def update_and_allocate(self, model, global_step, force_mask=False): + # # Update the importance score and allocate the budget + if global_step < self.peft_config.total_step - self.peft_config.tfinal: + self.update_ipt(model) + budget, mask_ind = self.budget_schedule(global_step) + # Allocate the budget according to importance scores + if mask_ind or force_mask: + rank_pattern = self.mask_to_budget(model, budget) + else: + rank_pattern = None + return budget, rank_pattern + + def mask_using_rank_pattern(self, model, rank_pattern): + # Mask the unimportant triplets + is_adapter_name_truncated = False + if self.adapter_name not in next(iter(rank_pattern.keys())): + is_adapter_name_truncated = True + + with torch.no_grad(): + for n, p in model.named_parameters(): + if f"lora_E.{self.adapter_name}" in n: + key = n if not is_adapter_name_truncated else n.replace(f".{self.adapter_name}", "") + mask = torch.Tensor(rank_pattern[key]).unsqueeze(-1).to(p.device) + p.masked_fill_(~mask.bool(), 0.0) diff --git a/MoRA/peft_mora/tuners/adalora/model.py b/MoRA/peft_mora/tuners/adalora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9abc3e2152deff562d283356d3f8e89df250f4 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/model.py @@ -0,0 +1,346 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import torch +from transformers.pytorch_utils import Conv1D + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.lora import LoraConfig, LoraModel +from peft_mora.tuners.tuners_utils import BaseTunerLayer +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + _freeze_adapter, + _get_submodules, + get_auto_gptq_quant_linear, + get_quantization_config, +) + +from .gptq import SVDQuantLinear +from .layer import AdaLoraLayer, RankAllocator, SVDLinear + + +class AdaLoraModel(LoraModel): + """ + Creates AdaLoRA (Adaptive LoRA) model from a pretrained transformers model. Paper: + https://openreview.net/forum?id=lq62uWRJjiY + + Args: + model ([`transformers.PreTrainedModel`]): The model to be adapted. + config ([`AdaLoraConfig`]): The configuration of the AdaLora model. 
+ adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The AdaLora model. + + Example:: + + >>> from transformers import AutoModelForSeq2SeqLM, LoraConfig >>> from peft import AdaLoraModel, AdaLoraConfig + >>> config = AdaLoraConfig( + peft_type="ADALORA", task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, target_modules=["q", "v"], + lora_dropout=0.01, + ) + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") >>> model = AdaLoraModel(model, config, "default") + + **Attributes**: + - **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model. + """ + + # Note: don't redefine prefix here, it should be inherited from LoraModel + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + traininable_mode_counter = 0 + for config in self.peft_config.values(): + if not config.inference_mode: + traininable_mode_counter += 1 + + if traininable_mode_counter > 1: + raise ValueError( + "AdaLoraModel supports only 1 trainable adapter. " + "When using multiple adapters, set inference_mode to True for all adapters except the one you want to train." + ) + + if self.peft_config[adapter_name].inference_mode: + _freeze_adapter(self.model, adapter_name) + else: + self.trainable_adapter_name = adapter_name + self.rankallocator = RankAllocator(self.model, self.peft_config[adapter_name], self.trainable_adapter_name) + + def _check_new_adapter_config(self, config: LoraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + super()._check_new_adapter_config(config) + + traininable_mode_counter = 0 + for config_ in self.peft_config.values(): + if not config_.inference_mode: + traininable_mode_counter += 1 + + if traininable_mode_counter > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 trainable adapter. " + "When using multiple adapters, set inference_mode to True for all adapters except the one " + "you want to train." + ) + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + kwargs = { + "r": lora_config.init_r, + "lora_alpha": lora_config.lora_alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + if (kwargs["loaded_in_8bit"] or kwargs["loaded_in_4bit"]) and not is_bnb_available(): + raise ImportError( + "To use AdaLora with 8-bit quantization, please install the `bitsandbytes` package. " + "You can install it with `pip install bitsandbytes`." 
+ ) + + quantization_config = get_quantization_config(self.model, method="gptq") + if quantization_config is not None: + kwargs["gptq_quantization_config"] = quantization_config + + # If it is not an AdaLoraLayer, create a new module, else update it with new adapters + if not isinstance(target, AdaLoraLayer): + new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + else: + target.update_layer( + adapter_name, + lora_config.init_r, + lora_config.lora_alpha, + lora_config.lora_dropout, + lora_config.init_lora_weights, + ) + + @staticmethod + def _create_new_module(lora_config, adapter_name, target, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import SVDLinear8bitLt + if is_bnb_4bit_available(): + from .bnb import SVDLinear4bit + + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "memory_efficient_backward": target_base_layer.state.memory_efficient_backward, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = SVDLinear8bitLt(target, adapter_name, **kwargs) + elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = SVDLinear4bit(target, adapter_name, **fourbit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear): + new_module = SVDQuantLinear(target, adapter_name, **kwargs) + else: + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear` and `Conv1D` are supported." 
+ ) + new_module = SVDLinear(target, adapter_name, **kwargs) + + return new_module + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[ + model_config["model_type"] + ] + return peft_config + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def forward(self, *args, **kwargs): + outputs = self.model.forward(*args, **kwargs) + + if (getattr(outputs, "loss", None) is not None) and isinstance(outputs.loss, torch.Tensor): + # Calculate the orthogonal regularization + orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight + + if orth_reg_weight <= 0: + raise ValueError("orth_reg_weight should be greater than 0. ") + + regu_loss = 0 + num_param = 0 + for n, p in self.model.named_parameters(): + if ("lora_A" in n or "lora_B" in n) and self.trainable_adapter_name in n: + para_cov = p @ p.T if "lora_A" in n else p.T @ p + I = torch.eye(*para_cov.size(), out=torch.empty_like(para_cov)) # noqa: E741 + I.requires_grad = False + num_param += 1 + regu_loss += torch.norm(para_cov - I, p="fro") + if num_param > 0: + regu_loss = regu_loss / num_param + else: + regu_loss = 0 + outputs.loss += orth_reg_weight * regu_loss + return outputs + + def resize_modules_by_rank_pattern(self, rank_pattern, adapter_name): + lora_config = self.peft_config[adapter_name] + for name, rank_idx in rank_pattern.items(): + if isinstance(rank_idx, list): + rank = sum(rank_idx) + elif isinstance(rank_idx, torch.Tensor): + rank_idx = rank_idx.view(-1) + rank = rank_idx.sum().item() + else: + raise ValueError("Unexpected type of rank_idx") + key = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) + _, target, _ = _get_submodules(self.model, key) + lora_E_weights = target.lora_E[adapter_name][rank_idx] + lora_A_weights = target.lora_A[adapter_name][rank_idx] + lora_B_weights = target.lora_B[adapter_name][:, rank_idx] + ranknum = target.ranknum[adapter_name] + target.update_layer( + adapter_name, + rank, + lora_config.lora_alpha, + lora_config.lora_dropout, + lora_config.init_lora_weights, + ) + with torch.no_grad(): + if rank > 0: + target.lora_E[adapter_name].copy_(lora_E_weights) + target.lora_A[adapter_name].copy_(lora_A_weights) + target.lora_B[adapter_name].copy_(lora_B_weights) + # The scaling is exactly as the previous + target.ranknum[adapter_name].copy_(ranknum) + + def resize_state_dict_by_rank_pattern(self, rank_pattern, state_dict, adapter_name): + for name, rank_idx in rank_pattern.items(): + rank = sum(rank_idx) + prefix = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) + for layer in ["lora_E", "lora_A", "lora_B"]: + key = f"base_model.model.{prefix}.{layer}.{adapter_name}" + if layer != "lora_B": + state_dict[key] = ( + state_dict[key][rank_idx] if rank != state_dict[key].shape[0] else state_dict[key] + ) + else: + state_dict[key] = ( + state_dict[key][:, rank_idx] if rank != state_dict[key].shape[1] else state_dict[key] + ) + return state_dict + + def update_and_allocate(self, global_step): + """ + This method updates 
Adalora budget and mask. + + This should be called in every training step after `loss.backward()` and before `zero_grad()`. + + `tinit`, `tfinal` and `deltaT` are handled with in the method. + + Args: + global_step (`int`): The current training step, it is used to calculate adalora budget. + + Example: + + ```python + >>> loss = model(**input).loss + >>> loss.backward() + >>> optimizer.step() + >>> model.base_model.update_and_allocate(i_step) + >>> optimizer.zero_grad() + ``` + """ + lora_config = self.peft_config[self.trainable_adapter_name] + # Update the importance score and allocate the budget + if global_step < lora_config.total_step - lora_config.tfinal: + _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step) + if rank_pattern: + lora_config.rank_pattern = rank_pattern + # Finalize the budget allocation + elif global_step == lora_config.total_step - lora_config.tfinal: + _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step, force_mask=True) + # for some reason, this freezes the trainable parameters and nothing gets updates + # self.resize_modules_by_rank_pattern(rank_pattern, self.trainable_adapter_name) + lora_config.rank_pattern = rank_pattern + self.rankallocator.reset_ipt() + # Currently using inefficient way to mask the unimportant weights using the rank pattern + # due to problem mentioned above + elif global_step > lora_config.total_step - lora_config.tfinal: + self.rankallocator.mask_using_rank_pattern(self.model, lora_config.rank_pattern) + # Pass the function and do forward propagation + else: + return None diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__init__.py b/MoRA/peft_mora/tuners/adaption_prompt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ede9455f70e41740768abe80f3198e78397053f --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
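+# Adaption-prompt tuner (LLaMA-Adapter style): AdaptedAttention wraps an existing attention
+# module and adds a learned prompt whose influence is gated by a zero-initialized scalar,
+# while AdaptionPromptModel swaps these wrappers in and out of the model per adapter.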
+from .config import AdaptionPromptConfig +from .layer import AdaptedAttention +from .model import AdaptionPromptModel + + +__all__ = ["AdaptionPromptConfig", "AdaptedAttention", "AdaptionPromptModel"] diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..226e57f11a8915235c8cc10dec920aefed216928 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..751917461c74e90128a773e67ae73e2f2b18de80 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24f787eabc2e23919d0735b5a9f7879b92be0017 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a56e027dc4312fa1e58b3ef41dcbfa33f747cee Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/utils.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53d7ef6c90c4cb39801d9b3890695821f4d6f407 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/config.py b/MoRA/peft_mora/tuners/adaption_prompt/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a6aad4c91be7db00980521f6c30a9bd358da038c --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/config.py @@ -0,0 +1,73 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
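+# Besides AdaptionPromptConfig itself, this module keeps a small per-architecture registry
+# (TRANSFORMERS_MODEL_CONFIG) describing where the k/v/o projections live and how to recompute
+# query states; currently only the "llama" model type is registered, and prepare_config()
+# fills in target_modules from that registry when the user leaves it unset.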
+ +from collections import namedtuple +from dataclasses import dataclass, field + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + +from .utils import llama_compute_query_states + + +@dataclass +class AdaptionPromptConfig(PeftConfig): + """Stores the configuration of an [`AdaptionPromptModel`].""" + + target_modules: str = field( + default=None, metadata={"help": "Name of the attention submodules to insert adaption prompts into."} + ) + adapter_len: int = field(default=None, metadata={"help": "Number of adapter tokens to insert"}) + adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"}) + + def __post_init__(self): + self.peft_type = PeftType.ADAPTION_PROMPT + + @property + def is_adaption_prompt(self) -> bool: + """Return True if this is an adaption prompt config.""" + return True + + +# Contains the config that is specific to a transformers model type. +ModelTypeConfig = namedtuple( + "ModelTypeConfig", ["compute_query_states", "target_modules", "k_proj_layer", "v_proj_layer", "o_proj_layer"] +) + +# Mapping of transformers model types to their specific configuration. +TRANSFORMERS_MODEL_CONFIG = { + "llama": ModelTypeConfig( + compute_query_states=llama_compute_query_states, + target_modules="self_attn", + k_proj_layer="k_proj", + v_proj_layer="v_proj", + o_proj_layer="o_proj", + ), +} + + +def prepare_config( + peft_config: AdaptionPromptConfig, + model, +) -> AdaptionPromptConfig: + """Prepare the config based on the llama model type.""" + if model.config.model_type not in TRANSFORMERS_MODEL_CONFIG: + raise ValueError("Unsupported model type for adaption prompt: '{model.config.model_type}'.") + + model_config = TRANSFORMERS_MODEL_CONFIG[model.config.model_type] + + if peft_config.target_modules is None: + peft_config.target_modules = model_config.target_modules + + return peft_config diff --git a/MoRA/peft_mora/tuners/adaption_prompt/layer.py b/MoRA/peft_mora/tuners/adaption_prompt/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..cdd7895eaae63c2bd1ec180087395d3baf9769bd --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/layer.py @@ -0,0 +1,120 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .config import TRANSFORMERS_MODEL_CONFIG + + +class AdaptedAttention(nn.Module): + """This module wraps a LLamaAttention module and injects adaption prompts.""" + + def __init__(self, model_type: str, adapter_len: int, model): + """ + Initialize object. + + Args: + model_type: The transformer model type. This is used to retrieve the right method to + compute query states. + adapter_len: The length of the adaption prompt to insert. + model: The original transformer attention module that is being wrapped. 
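+
+            The wrapper keeps `adapter_len` learnable prompt vectors (drawn from a standard
+            normal) plus a gate that is initialized to zero ("zero-init"), so the adapter's
+            contribution to the attention output starts at zero.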
+ """ + assert not isinstance(model, AdaptedAttention) + super().__init__() + self.model_type = model_type + self.model = model + self.adapter_len = adapter_len + # Assume all parameters of the attention model we are wrapping are on the same device. + device = next(model.parameters()).device + # Don't think this was specified in the paper, but we follow the official repo which used an Embedding + # which initializes the tokens with standard normal values. + # https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L234 + # (bsz, adapter_len, hidden_size) + target_dtype = ( + model.q_proj.weight.dtype if model.q_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32 + ) + self.adaption_prompt = nn.Parameter( + torch.empty(1, adapter_len, self.model.hidden_size, device=device, dtype=target_dtype).normal_() + ) + # Initialize the gate to 0 as this is "zero-init". + self.adaption_gate = nn.Parameter(torch.zeros(1, device=device, dtype=target_dtype)) + + def forward(self, **kwargs): + """ + Forward pass for the adapter which wraps the original LlamaAttention module. + + "Official" paper implementation: + https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L141 + + Args: + kwargs: See the original LlamaAttention module. + """ + if kwargs.get("output_attention", False): + raise NotImplementedError("output_attention is not currently supported.") + + output, _, past_key_value = self.model(**kwargs) + bsz = output.shape[0] + q_len = output.shape[1] + embed_dim = output.shape[2] + k_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer + v_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].v_proj_layer + o_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].o_proj_layer + + if k_proj_layer == v_proj_layer: + _, key, value = getattr(self.model, k_proj_layer)(self.adaption_prompt).split(embed_dim, dim=2) + else: + key = getattr(self.model, k_proj_layer)(self.adaption_prompt) + value = getattr(self.model, v_proj_layer)(self.adaption_prompt) + # (bsz, num_heads, adapter_len, head_dim) + adapter_k = ( + key.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim) + .repeat(bsz, 1, 1, 1) + .transpose(1, 2) + ) + # (bsz, num_heads, adapter_len, head_dim) + adapter_v = ( + value.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim) + .repeat(bsz, 1, 1, 1) + .transpose(1, 2) + ) + + # Recompute query states. + compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states + # (bsz, num_heads, q_len, head_dim) + query_states = compute_query_states(model=self.model, **kwargs) + + previous_dtype = query_states.dtype + # (bsz, num_heads, q_len, adapter_len) + scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt( + self.model.head_dim + ) + # Upcast attention to fp32 + # (bsz, num_heads, q_len, adapter_len) + scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype) + # (bsz, q_len, num_heads * head_dim) + adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1) + # (bsz, q_len, hidden_size) + if o_proj_layer is not None: + adapter_output = getattr(self.model, o_proj_layer)(adapter_output) + + # Add adaption prompt output to original output. + output = output + adapter_output + + # Restore original dtype. 
+ output = output.to(previous_dtype) + return output, None, past_key_value diff --git a/MoRA/peft_mora/tuners/adaption_prompt/model.py b/MoRA/peft_mora/tuners/adaption_prompt/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3b39058be486c1814f5424168db6b04e95ead0fa --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/model.py @@ -0,0 +1,161 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +import torch.nn as nn + +from peft_mora.utils import _freeze_adapter, _get_submodules + +from .config import AdaptionPromptConfig, prepare_config +from .layer import AdaptedAttention +from .utils import is_adaption_prompt_trainable + + +class AdaptionPromptModel(nn.Module): + """ + Implements adaption prompts as described in https://arxiv.org/pdf/2303.16199.pdf. + + The top L attention modules are replaced with AdaptedAttention modules that wrap the original ones, but insert + trainable prompts with gates (for zero init). + + Notes on the multi-adapter pattern: + - We store the states of different adapters by keeping a dictionary of AdaptedAttention modules indexed by adapter + name. + - Every time we switch adapters, we remove the modules of the currently active adapter from the model, store them + in the dictionary, and replace them with the modules of the new adapter. + - To avoid duplicated and potentially inconsistent state, the currently active adapter is always removed from the + dictionary. + - Disabling the adapter would also result in the modules being removed from the model. + """ + + def __init__(self, model, configs: Dict, adapter_name: str): + super().__init__() + self.model = model + # Store adapter configs by name. + self.peft_config: Dict[str, AdaptionPromptConfig] = {} + # Store lists of the parents of the affected attention modules by adapter name. + # We keep references to the parents so we can swap the adapters in-and-out of the model. + self._parents: Dict[str, List[nn.Module]] = {} + # Store lists of cached AdaptedAttention modules by name. + self._cached_adapters: Dict[str, List] = {} + # The name of the currently active adapter. + self._active_adapter = None + # Whether the adapter is enabled. 
+ self._enabled = True + self.forward = self.model.forward + self.add_adapter(adapter_name, configs[adapter_name]) + self._mark_only_adaption_prompts_as_trainable(self.model) + + def add_adapter(self, adapter_name: str, config: AdaptionPromptConfig) -> None: + """Add an adapter with the given name and config.""" + config = prepare_config(config, self.model) + if adapter_name in self.peft_config: + raise ValueError(f"Adapter with name '{adapter_name}' already exists.") + + parents = [] + for name, _ in self.model.named_modules(): + if name.endswith(config.target_modules): + par, _, _ = _get_submodules(self.model, name) + parents.append(par) + if len(parents) < config.adapter_layers: + raise ValueError( + f"Config specifies more adapter layers '{config.adapter_layers}'" + f" than the model has '{len(parents)}'." + ) + # Note that if the target modules are not in Sequential, ModuleList, or + # some other PyTorch ordered container, the behavior is undefined as we + # assume here that the order of the modules is the same as the order of + # the transformer decoder layers. + parents = parents[-config.adapter_layers :] + self._parents[adapter_name] = parents + + # It is only None during initialization. + # If it is disabled, we don't have to remove the modules. + if self._active_adapter is not None and self._enabled: + self._remove_adapted_attentions(self._active_adapter) + self._active_adapter = adapter_name + self.peft_config[adapter_name] = config + self._create_adapted_attentions(config, parents) + if not self._enabled: + self._remove_adapted_attentions(self._active_adapter) + + if config.inference_mode: + _freeze_adapter(self.model, adapter_name) + + def set_adapter(self, adapter_name: str) -> None: + """Set the model to use the adapter with the given name.""" + if self._active_adapter == adapter_name: + return + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter with name '{adapter_name}' does not exist.") + + if self._enabled: + self._remove_adapted_attentions(self._active_adapter) + self._set_adapted_attentions(adapter_name) + + self._active_adapter = adapter_name + + def enable_adapter_layers(self): + """Enable adapter layers by swapping in cached AdaptedAttention modules.""" + self._enabled = True + self._set_adapted_attentions(self._active_adapter) + + def disable_adapter_layers(self): + """Disable adapter layers by swapping out AdaptedAttention modules.""" + self._enabled = False + self._remove_adapted_attentions(self._active_adapter) + + def _create_adapted_attentions(self, config: AdaptionPromptConfig, parents: List[nn.Module]) -> None: + """Wrap LlamaAttention modules with newly created AdaptedAttention modules.""" + for par in parents: + attn = AdaptedAttention( + model_type=self.model.config.model_type, + adapter_len=config.adapter_len, + model=getattr(par, config.target_modules), + ) + setattr(par, config.target_modules, attn) + + def _set_adapted_attentions(self, adapter_name: str) -> None: + """Replace LlamaAttention modules with cached AdaptedAttention modules.""" + cached = self._cached_adapters[adapter_name] + del self._cached_adapters[adapter_name] + config = self.peft_config[adapter_name] + for i, par in enumerate(self._parents[adapter_name]): + setattr(par, config.target_modules, cached[i]) + + def _remove_adapted_attentions(self, adapter_name: str) -> None: + """Remove AdaptedAttention modules from the model and store them in the cache.""" + config = self.peft_config[adapter_name] + adapted_attentions = [] + for par in self._parents[adapter_name]: + 
attn = getattr(par, config.target_modules) + adapted_attentions.append(attn) + setattr(par, config.target_modules, attn.model) + self._cached_adapters[adapter_name] = adapted_attentions + + def _mark_only_adaption_prompts_as_trainable(self, model: nn.Module) -> None: + """Freeze all parameters of the model except the adaption prompts.""" + for n, p in model.named_parameters(): + if not is_adaption_prompt_trainable(n): + p.requires_grad = False + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + # This is necessary as e.g. causal models have various methods that we + # don't want to re-implement here. + return getattr(self.model, name) diff --git a/MoRA/peft_mora/tuners/adaption_prompt/utils.py b/MoRA/peft_mora/tuners/adaption_prompt/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..24f33e5584009a04f7d73f5f675d3395f09da215 --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/utils.py @@ -0,0 +1,111 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect + +import torch +import torch.nn as nn + + +def llama_rotate_half(x: torch.Tensor) -> torch.Tensor: + """ + Rotate half the hidden dims of the input. + + This function was duplicated verbatim from: + https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L126 + + This was done to eliminate the Llama transformers implementation as a dependency of this file. Note that some other + functions were also adapted from the transformers implementation but were modified. + """ + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def llama_apply_rotary_pos_emb(q, cos, sin, position_ids): + """ + Apply rotary position embedding to query states in the Llama model. + + This function was adapted from: + https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L133 + + It was modified to remove unnecessary processing of key states. The method is compatible with transformers <= + 4.34.2 and also with the latest version (>=4.35). 
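`llama_rotate_half` pairs each dimension with its counterpart in the other half so that `q * cos + rotate_half(q) * sin` acts as a block-diagonal 2-D rotation. A self-contained check of that identity with toy shapes, independent of transformers (the duplicated-angle layout below mirrors the rotate-half convention):

``` python
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Same idea as llama_rotate_half: swap the two halves and negate the second one.
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


bsz, num_heads, seq_len, head_dim = 1, 1, 4, 8  # toy shapes
q = torch.randn(bsz, num_heads, seq_len, head_dim)

# Each rotation angle is shared by a dimension and its partner in the other half,
# which is the cos/sin layout the rotate-half trick expects.
theta = torch.rand(bsz, num_heads, seq_len, head_dim // 2)
cos = torch.cat((theta.cos(), theta.cos()), dim=-1)
sin = torch.cat((theta.sin(), theta.sin()), dim=-1)

q_embed = (q * cos) + (rotate_half(q) * sin)

# A rotation never changes vector length, so RoPE leaves the query norms intact.
assert torch.allclose(q.norm(dim=-1), q_embed.norm(dim=-1), atol=1e-5)
```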
+ """ + # In previous transformers version cos/sin cached had a shape of 4D + if len(cos.shape) == 4: + gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] + gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) + cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + # In the new version, it is 2D so we fall back to the new implementation + # https://github.com/huggingface/transformers/blame/eef7ea98c31a333bacdc7ae7a2372bde772be8e4/src/transformers/models/llama/modeling_llama.py#L222-L226 + else: + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (llama_rotate_half(q) * sin) + return q_embed + + +def llama_compute_query_states(model: nn.Module, **kwargs) -> torch.Tensor: + """ + Compute query states for Llama models specifically. They need to be recomputed as the forward() method of the + original LlamaModel in the transformers library does not return them. See the related discussion in the PR: + https://github.com/huggingface/peft/pull/268 + """ + hidden_states = kwargs.get("hidden_states") + position_ids = kwargs.get("position_ids") + past_key_value = kwargs.get("past_key_value") + bsz, q_len, _ = hidden_states.size() + query_states = model.q_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2) + value_states = model.v_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2) + seq_len = q_len + + if past_key_value is not None: + if isinstance(past_key_value, tuple): + # for transformers <= 4.35 + seq_len += past_key_value[0].shape[-2] + else: + # since transformers 4.36, this is a DynamicCache instance + seq_len += past_key_value.get_seq_length(model.layer_idx) + + # For transformers > 4.37.2 `position_ids` became a required arguments in the rotary embedding's forward pass. 
+ if "position_ids" not in inspect.signature(model.rotary_emb.forward).parameters: + # TODO we assume that position_ids is not None here, not sure if that is safe but the old code also did that + cos, sin = model.rotary_emb(value_states, seq_len=seq_len) + return llama_apply_rotary_pos_emb(query_states, cos, sin, position_ids) + + past_seen_tokens = 0 + if position_ids is None: + # Compute position_ids, since they are required for transformers > 4.37.2 + if past_key_value is None: + new_cache_positions = torch.arange(q_len, q_len + q_len, device=value_states.device) + else: + past_seen_tokens = past_key_value.get_usable_length(q_len, model.layer_idx) + new_cache_positions = torch.arange(past_seen_tokens, past_seen_tokens + q_len, device=value_states.device) + position_ids = new_cache_positions.unsqueeze(0) + + cos, sin = model.rotary_emb(value_states, seq_len=q_len + past_seen_tokens, position_ids=position_ids) + + # For batched inference unsqueeze it on the correct dim + # since: https://github.com/huggingface/transformers/pull/29109 + if len(cos.shape) == 3: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + + return (query_states * cos) + (llama_rotate_half(query_states) * sin) + + +def is_adaption_prompt_trainable(params: str) -> bool: + """Return True if module is trainable under adaption prompt fine-tuning.""" + return params.split(".")[-1].startswith("adaption_") diff --git a/MoRA/peft_mora/tuners/ia3/__init__.py b/MoRA/peft_mora/tuners/ia3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d54cd89cc1d123eb321f5c6a13f382afe70e4b55 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .config import IA3Config +from .layer import Conv2d, IA3Layer, Linear +from .model import IA3Model + + +__all__ = ["Conv2d", "IA3Config", "IA3Layer", "IA3Model", "Linear"] + + +def __getattr__(name): + if (name == "Linear8bitLt") and is_bnb_available(): + from .bnb import Linear8bitLt + + return Linear8bitLt + + if (name == "Linear4bit") and is_bnb_4bit_available(): + from .bnb import Linear4bit + + return Linear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b7dd99ab2f5f6f5b1f5f7911c7dddd540ec5d61 Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d8f1be8c02054ab6c6f2643ac7a0c6c2cf7190e Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45dda2966e23bce643b13ded365e7a1db9a84f88 Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d7bffa489a033e6604d1b8df44d1e5d22a160f6 Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/bnb.py b/MoRA/peft_mora/tuners/ia3/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..3d033191a27081bc7cd34601f62f53ab8eca2778 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/bnb.py @@ -0,0 +1,129 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
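The module-level `__getattr__` above (PEP 562) defers importing the bitsandbytes-backed classes until they are first requested, so importing the tuner package never requires bitsandbytes to be installed. A stripped-down sketch of the same pattern, written as a hypothetical package `__init__.py` (the relative imports are placeholders, so this shows the shape of the idiom rather than a runnable script):

``` python
# Hypothetical my_tuners/__init__.py, shown only to illustrate the lazy-import pattern.
from .layer import Linear  # cheap import, always available


def __getattr__(name):
    # Called only when `name` is not found as a regular module attribute (PEP 562).
    if name == "Linear8bitLt":
        from .bnb import Linear8bitLt  # optional heavy dependency, imported on first access

        return Linear8bitLt
    raise AttributeError(f"module {__name__} has no attribute {name}")
```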
+ +from typing import Any + +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .layer import IA3Layer + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, IA3Layer): + # (IA)^3 implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + if self.disable_adapters: + return self.base_layer(x) + + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32) + if requires_conversion: + x = x.float() + if self.is_feedforward: + result = self.base_layer(x * ia3_scaling) + expected_dtype = result.dtype + else: + result = self.base_layer(x) + expected_dtype = result.dtype + result = result * ia3_scaling + + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, IA3Layer): + # IA3 implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + if self.disable_adapters: + return self.base_layer(x) + + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32) + if requires_conversion: + x = x.float() + if self.is_feedforward: + result = self.base_layer(x * ia3_scaling) + expected_dtype = result.dtype + else: + result = self.base_layer(x) + expected_dtype = result.dtype + result = result * ia3_scaling + + result = result.clone() + # adalora.py and lora.py both suggest that this is necessary for 4-bit training on older versions of Pytorch. + # This has been duplicated here. + + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep diff --git a/MoRA/peft_mora/tuners/ia3/config.py b/MoRA/peft_mora/tuners/ia3/config.py new file mode 100644 index 0000000000000000000000000000000000000000..8e52286c05a0eb4befb808c51a7475b714af7847 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/config.py @@ -0,0 +1,98 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
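Both quantized layers apply the same (IA)^3 rule as the float implementation: feedforward targets scale the input before the base layer, every other target scales the output after it. A plain-PyTorch sketch of that rule without bitsandbytes (illustrative sizes):

``` python
import torch
import torch.nn as nn

base = nn.Linear(16, 16)
x = torch.randn(2, 16)

ia3_ff = torch.ones(16)    # vector for a feedforward target, ones at initialization
ia3_attn = torch.ones(16)  # vector for an attention k/v target, ones at initialization

out_ff = base(x * ia3_ff)      # feedforward: rescale the activations going in
out_attn = base(x) * ia3_attn  # everything else: rescale the activations coming out

# At the ones-initialization both paths reproduce the base layer exactly.
assert torch.allclose(out_ff, base(x))
assert torch.allclose(out_attn, base(x))
```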
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + + +@dataclass +class IA3Config(PeftConfig): + """ + This is the configuration class to store the configuration of a [`IA3Model`]. + + Args: + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + feedforward_modules (`Optional[Union[List[str], str]]`): + The names of the modules to be treated as feedforward modules, as in the original paper. These modules will + have (IA)Β³ vectors multiplied to the input, instead of the output. `feedforward_modules` must be a name or + a subset of names present in `target_modules`. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from (IA)Β³ layers to be set as trainable and saved in the final checkpoint. + init_ia3_weights (`bool`): + Whether to initialize the vectors in the (IA)Β³ layers, defaults to `True`. Setting this to `False` is + discouraged. + """ + + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with (IA)Β³." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually." 
+ ), + }, + ) + feedforward_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or a regex expression of module names which are feedforward" + "For example, ['output.dense']" + }, + ) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_ia3_weights: bool = field( + default=True, + metadata={"help": "Whether to initialize the vectors in the (IA)^3 layers."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.IA3 + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.feedforward_modules = ( + set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules + ) + + # check if feedforward_modules is a subset of target_modules. run the check only if both are sets + if isinstance(self.feedforward_modules, set) and isinstance(self.target_modules, set): + if not self.feedforward_modules.issubset(self.target_modules): + raise ValueError("`feedforward_modules` should be a subset of `target_modules`") diff --git a/MoRA/peft_mora/tuners/ia3/layer.py b/MoRA/peft_mora/tuners/ia3/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..3668460768bcc703e0f597030450125336c475e6 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/layer.py @@ -0,0 +1,307 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
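Beyond normalizing lists to sets, `__post_init__` enforces a single constraint: `feedforward_modules` must be a subset of `target_modules`. A hedged usage sketch, assuming the vendored package is importable as `peft_mora` (the module names are model-specific examples, not defaults):

``` python
from peft_mora import IA3Config

# Valid: every feedforward module is also a target module.
config = IA3Config(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["k", "v", "wi_1"],
    feedforward_modules=["wi_1"],
)

# Invalid: "wo" is not in target_modules, so __post_init__ raises.
try:
    IA3Config(target_modules=["k", "v"], feedforward_modules=["wo"])
except ValueError as err:
    print(err)  # `feedforward_modules` should be a subset of `target_modules`
```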
+ +import warnings +from typing import Any, List, Optional + +import torch +import torch.nn as nn +from transformers.pytorch_utils import Conv1D + +from peft_mora.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft_mora.utils import transpose + + +class IA3Layer(BaseTunerLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("ia3_l",) + + def __init__(self, base_layer: nn.Module, is_feedforward: bool, **kwargs) -> None: + self.base_layer = base_layer + self.ia3_l = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.is_feedforward = is_feedforward + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + self.in_features = in_features + self.out_features = out_features + + def update_layer(self, adapter_name, init_ia3_weights): + # This code works for linear layers, override for other layer types + # Actual trainable parameters + if self.is_feedforward: + weight = torch.randn((1, self.in_features)) + else: + weight = torch.randn((self.out_features, 1)) + self.ia3_l[adapter_name] = nn.Parameter(weight) + if init_ia3_weights: + self.reset_ia3_parameters(adapter_name) + self.to(self.get_base_layer().weight.device) + self.set_adapter(self.active_adapters) + + def reset_ia3_parameters(self, adapter_name): + if adapter_name in self.ia3_l.keys(): + # initialize learned vector with torch.ones + nn.init.constant_(self.ia3_l[adapter_name], 1.0) + + +class Linear(nn.Module, IA3Layer): + # (IA)^3 implemented in a dense layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. useful while unloading later + init_ia3_weights: bool = True, # whether to initialize IA3 weights + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + self.fan_in_fan_out = fan_in_fan_out + self.is_target_conv_1d_layer = is_target_conv_1d_layer + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + if safe_merge: + orig_weights = base_layer.weight.data + orig_weights = torch.mul(orig_weights, ia3_l) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + warnings.warn("Unmerge result can be inaccurate for (IA)^3.") + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + # Add tolerace to avoid division by zero + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8 + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + dtype = previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + dtype = self.ia3_l[active_adapter].dtype + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + if self.is_feedforward: + x = x.to(dtype) + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # e.g. bf16 vs fp32. Is that okay? 
+ interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + result = result.to(dtype) * ia3_scaling + + result = result.to(previous_dtype) + return result + + +class Conv2d(nn.Module, IA3Layer): + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + self.update_layer(adapter_name, init_ia3_weights) + + def update_layer(self, adapter_name, init_ia3_weights): + # Actual trainable parameters + if self.is_feedforward: + weight = torch.randn((1, self.in_features, 1, 1)) + else: + weight = torch.randn((1, self.out_features, 1, 1)) + self.ia3_l[adapter_name] = nn.Parameter(weight) + if init_ia3_weights: + self.reset_ia3_parameters(adapter_name) + self.to(self.get_base_layer().weight.device) + self.set_adapter(self.active_adapters) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + ia3_scaling = self.ia3_l[active_adapter].data + if not self.is_feedforward: + ia3_scaling = ia3_scaling.permute(1, 0, 2, 3) + + if safe_merge: + output_weight = torch.mul(base_layer.weight.data, ia3_scaling).clone() + + if not torch.isfinite(output_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = output_weight + else: + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_scaling) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + warnings.warn("Unmerge result can be inaccurate for (IA)^3.") + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + # divide by (IA)^3 vector. 
Add tolerace to avoid division by zero + ia3_scaling = self.ia3_l[active_adapter].data + if not self.is_feedforward: + ia3_scaling = ia3_scaling.permute(1, 0, 2, 3) + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_scaling + 1e-8) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + dtype = previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + dtype = self.ia3_l[active_adapter].dtype + ia3_scaling *= self.ia3_l[active_adapter] + + if self.is_feedforward: + x = x.to(dtype) + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # e.g. bf16 vs fp32. Is that okay? + interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + result = result.to(dtype) * ia3_scaling + + result = result.to(previous_dtype) + return result diff --git a/MoRA/peft_mora/tuners/ia3/model.py b/MoRA/peft_mora/tuners/ia3/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d46cf6d24f3fb92da8fe45c61d2395a36c51f6 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/model.py @@ -0,0 +1,394 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import re +import warnings +from dataclasses import asdict +from enum import Enum +from typing import Optional + +import torch +from torch import nn +from transformers.pytorch_utils import Conv1D + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _get_submodules, +) + +from .layer import Conv2d, IA3Layer, Linear + + +class IA3Model(BaseTuner): + """ + Creates a Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3) model from a pretrained + transformers model. The method is described in detail in https://arxiv.org/abs/2205.05638 + + Args: + model ([`~transformers.PreTrainedModel`]): The model to be adapted. + config ([`IA3Config`]): The configuration of the (IA)^3 model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The (IA)^3 model. 
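For Conv2d targets the learned vector has shape `(1, channels, 1, 1)`, so in the forward path it broadcasts over the batch and spatial dimensions like a per-channel gain. A small sketch of that scaling for a non-feedforward conv target (toy sizes, plain PyTorch):

``` python
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)
x = torch.randn(2, 3, 16, 16)

ia3_l = torch.ones(1, 8, 1, 1)  # one scale per output channel, ones at initialization
out = conv(x) * ia3_l           # broadcasts over batch, height and width

assert torch.allclose(out, conv(x))  # a no-op until the vector is trained away from ones
print(out.shape)                     # torch.Size([2, 8, 16, 16])
```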
+ + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM, ia3Config + >>> from peft import IA3Model, IA3Config + + >>> config = IA3Config( + ... peft_type="IA3", + ... task_type="SEQ_2_SEQ_LM", + ... target_modules=["k", "v", "w0"], + ... feedforward_modules=["w0"], + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> ia3_model = IA3Model(config, model) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`ia3Config`]): The configuration of the (IA)^3 model. + """ + + prefix: str = "ia3_" + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + @staticmethod + def _create_new_module(ia3_config, adapter_name, target, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import Linear8bitLt + + if is_bnb_4bit_available(): + from .bnb import Linear4bit + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + is_feedforward = kwargs.pop("is_feedforward", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "memory_efficient_backward": target_base_layer.state.memory_efficient_backward, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, is_feedforward=is_feedforward, **eightbit_kwargs) + elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, is_feedforward=is_feedforward, **fourbit_kwargs) + elif isinstance(target, torch.nn.Conv2d): + new_module = Conv2d(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False + new_module = Linear(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True + new_module = Linear( + target, adapter_name, is_feedforward=is_feedforward, is_target_conv_1d_layer=True, **kwargs + ) + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported." 
+ ) + return new_module + + @staticmethod + def _check_target_module_exists(ia3_config, key): + return check_target_module_exists(ia3_config, key) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + def _create_and_replace( + self, + ia3_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + # check if target module is in feedforward_modules + is_feedforward = self._check_target_module_feedforward(ia3_config, current_key) + + kwargs = { + "fan_in_fan_out": ia3_config.fan_in_fan_out, + "init_ia3_weights": ia3_config.init_ia3_weights, + "is_feedforward": is_feedforward, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + + if isinstance(target, IA3Layer): + target.update_layer( + adapter_name, + ia3_config.init_ia3_weights, + ) + else: + new_module = self._create_new_module(ia3_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _check_target_module_feedforward(ia3_config, key) -> bool: + """ + A helper private method that checks if the target module `key` matches with a feedforward module specified in + `ia3_config` + """ + if isinstance(ia3_config.feedforward_modules, str): + is_feedforward = bool(re.fullmatch(ia3_config.feedforward_modules, key)) + else: + is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules) + return is_feedforward + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + # layers with base_layer don't need the weight to be copied, as they have a reference already + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if self.prefix in name: + module.to(child.weight.device) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (IA3Layer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. 
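`_check_target_module_feedforward` follows the usual peft matching semantics: a single string is treated as a regex that must fully match the module key, while a list matches by suffix. A tiny standalone illustration with a made-up module path:

``` python
import re
from typing import List, Union


def is_feedforward(feedforward_modules: Union[str, List[str]], key: str) -> bool:
    if isinstance(feedforward_modules, str):
        return bool(re.fullmatch(feedforward_modules, key))
    return any(key.endswith(target) for target in feedforward_modules)


key = "encoder.block.0.layer.1.DenseReluDense.wi_1"
print(is_feedforward(["wi_1", "wo"], key))             # True, suffix match
print(is_feedforward(r".*DenseReluDense\.wi_1", key))  # True, full regex match
print(is_feedforward(["q", "v"], key))                 # False
```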
+ """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. + """ + for module in self.model.modules(): + if isinstance(module, IA3Layer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + + def _prepare_adapter_config(self, peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]] + if peft_config.feedforward_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING: + raise ValueError("Please specify `feedforward_modules` in `peft_config`") + peft_config.feedforward_modules = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[ + model_config["model_type"] + ] + return peft_config + + def _unload_and_optionally_merge( + self, merge: bool = True, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ): + r""" + This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + safe_merge (`bool`, `optional`, defaults to `False`): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + if getattr(self.model, "is_loaded_in_8bit", False): + raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode") + + if getattr(self.model, "is_loaded_in_4bit", False): + raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode") + + self._unloading_checks(adapter_names) + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> torch.nn.Module: + r""" + This method merges the IAΒ³ layers into the base model. This is needed if someone wants to use the base model as + a standalone model. + + Args: + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge(safe_merge=safe_merge, adapter_names=adapter_names) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the IAΒ³ modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, IA3Layer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] diff --git a/MoRA/peft_mora/tuners/loha/__init__.py b/MoRA/peft_mora/tuners/loha/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2f39deee17ab9cb0e24b3a98d8b54eb7eeb27c1f --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import LoHaConfig +from .layer import Conv2d, Linear, LoHaLayer +from .model import LoHaModel + + +__all__ = ["LoHaConfig", "LoHaModel", "Conv2d", "Linear", "LoHaLayer"] diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a473c8c1816b811b15cc6c0d8c8ab231eddaa6ce Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9993e6eb22f188f86101fc36ca369c4b4b499a77 Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eb9cf066964c717c21b288f1098b70cc05fa104 Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45ead36837177dbcd203c78b239dfea8b123f060 Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/config.py b/MoRA/peft_mora/tuners/loha/config.py new file mode 100644 index 0000000000000000000000000000000000000000..9450d4735122a462f71e898705aa4ac1ab303fe9 --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/config.py @@ -0,0 +1,121 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.tuners.lycoris_utils import LycorisConfig +from peft_mora.utils import PeftType + + +@dataclass +class LoHaConfig(LycorisConfig): + """ + This is the configuration class to store the configuration of a [`LoHaModel`]. + + Args: + r (`int`): + LoHa rank. + alpha (`int`): + The alpha parameter for LoHa scaling. + rank_dropout (`float`): + The dropout probability for rank dimension during training. 
+ module_dropout (`float`): + The dropout probability for disabling LoHa modules during training. + use_effective_conv2d (`bool`): + Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + init_weights (`bool`): + Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is + discouraged. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `alpha`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + """ + + r: int = field(default=8, metadata={"help": "LoHa rank"}) + alpha: int = field(default=8, metadata={"help": "LoHa alpha"}) + rank_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for rank dimension during training"} + ) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling LoHa modules during training"} + ) + use_effective_conv2d: bool = field( + default=False, + metadata={ + "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + }, + ) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with LoHa." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the LoHa layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." 
+ }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoHA layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LOHA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/MoRA/peft_mora/tuners/loha/layer.py b/MoRA/peft_mora/tuners/loha/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..5b782f9dea544513b6a7e10b37a2061f223060d2 --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/layer.py @@ -0,0 +1,375 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Any, Set, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft_mora.tuners.lycoris_utils import LycorisLayer + + +class LoHaLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2") + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # LoHa info + self.hada_w1_a = nn.ParameterDict({}) + self.hada_w1_b = nn.ParameterDict({}) + self.hada_w2_a = nn.ParameterDict({}) + self.hada_w2_b = nn.ParameterDict({}) + self.hada_t1 = nn.ParameterDict({}) + self.hada_t2 = nn.ParameterDict({}) + + @property + def _available_adapters(self) -> Set[str]: + return {*self.hada_w1_a, *self.hada_w1_b, *self.hada_w2_a, *self.hada_w2_b, *self.hada_t1, *self.hada_t2} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]): + # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75 + if len(shape) == 4: + self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode + self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + + self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + else: + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) + self.hada_w1_b[adapter_name] = 
nn.Parameter(torch.empty(r, shape[1])) + + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) + + def reset_adapter_parameters(self, adapter_name: str): + # Original implementation performs initialization with normal distribution + # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 + + # FedPara paper proposes to perform He initialization, let's stick with it + # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization + if adapter_name in self.hada_w1_a.keys(): + nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.zeros_(self.hada_w2_b[adapter_name]) + if adapter_name in self.hada_t1.keys(): + nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + + def reset_adapter_parameters_random(self, adapter_name: str): + # Original implementation performs initialization with normal distribution + # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 + + # FedPara paper proposes to perform He initialization, let's stick with it + # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization + if adapter_name in self.hada_w1_a.keys(): + nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_b[adapter_name], a=math.sqrt(5)) + if adapter_name in self.hada_t1.keys(): + nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + alpha: float, + rank_dropout: float, + module_dropout: float, + init_weights: bool, + use_effective_conv2d: bool = False, + **kwargs, + ) -> None: + """Internal function to create loha adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + alpha (`float`): Alpha for the added adapter. + rank_dropout (`float`): The dropout probability for rank dimension during training. + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize weights. + use_effective_conv2d (`bool`, *optional*, defaults to `False`): + Use parameter effective decomposition for Conv2d with ksize > 1. 
+ """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.alpha[adapter_name] = alpha + self.scaling[adapter_name] = alpha / r + self.rank_dropout[adapter_name] = rank_dropout + self.module_dropout[adapter_name] = module_dropout + + # Determine shape of LoHa weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) + if use_effective_conv2d: + shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) + else: + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) + else: + raise TypeError(f"LoHa is not implemented for base layers of type {type(base_layer).__name__}") + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L178 + if adapter_name in self.hada_t1.keys(): + weight = make_weight_cp( + self.hada_t1[adapter_name], + self.hada_w1_a[adapter_name], + self.hada_w1_b[adapter_name], + self.hada_t2[adapter_name], + self.hada_w2_a[adapter_name], + self.hada_w2_b[adapter_name], + scale=torch.tensor(self.scaling[adapter_name]), + ) + else: + weight = make_weight( + self.hada_w1_a[adapter_name], + self.hada_w1_b[adapter_name], + self.hada_w2_a[adapter_name], + self.hada_w2_b[adapter_name], + scale=torch.tensor(self.scaling[adapter_name]), + ) + + base_layer = self.get_base_layer() + weight = weight.reshape(base_layer.weight.shape) + + # Perform rank dropout during training - drop rows of addition weights + rank_dropout = self.rank_dropout[adapter_name] + if self.training and rank_dropout: + drop = (torch.rand(weight.size(0)) > rank_dropout).to(weight.dtype) + drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device) + # TODO: Investigate if there should be a scaler like in normal dropout during training + # Original implementation doesn't have it + # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L193 + drop /= drop.mean() + weight *= drop + + return weight + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if 
(not self.training) or (self.training and torch.rand(1) > module_dropout): + result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs) + + result = result.to(previous_dtype) + return result + + +class Linear(LoHaLayer): + """LoHa implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + return F.linear(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." + rep + + +class Conv2d(LoHaLayer): + """LoHa implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() + return F.conv2d( + input, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, + ) + + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." 
+ rep + + +# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9 + + +class HadaWeight(torch.autograd.Function): + @staticmethod + def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)): + ctx.save_for_backward(w1a, w1b, w2a, w2b, scale) + diff_weight = ((w1a @ w1b) * (w2a @ w2b)) * scale + return diff_weight + + @staticmethod + def backward(ctx, grad_out): + (w1a, w1b, w2a, w2b, scale) = ctx.saved_tensors + grad_out = grad_out * scale + temp = grad_out * (w2a @ w2b) + grad_w1a = temp @ w1b.T + grad_w1b = w1a.T @ temp + + temp = grad_out * (w1a @ w1b) + grad_w2a = temp @ w2b.T + grad_w2b = w2a.T @ temp + + del temp + return grad_w1a, grad_w1b, grad_w2a, grad_w2b, None + + +class HadaWeightCP(torch.autograd.Function): + @staticmethod + def forward(ctx, t1, w1a, w1b, t2, w2a, w2b, scale=torch.tensor(1)): + ctx.save_for_backward(t1, w1a, w1b, t2, w2a, w2b, scale) + + rebuild1 = torch.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a) + rebuild2 = torch.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a) + + return rebuild1 * rebuild2 * scale + + @staticmethod + def backward(ctx, grad_out): + (t1, w1a, w1b, t2, w2a, w2b, scale) = ctx.saved_tensors + grad_out = grad_out * scale + + temp = torch.einsum("i j k l, j r -> i r k l", t2, w2b) + rebuild = torch.einsum("i j k l, i r -> r j k l", temp, w2a) + + grad_w = rebuild * grad_out + del rebuild + + grad_w1a = torch.einsum("r j k l, i j k l -> r i", temp, grad_w) + grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w1a.T) + del grad_w, temp + + grad_w1b = torch.einsum("i r k l, i j k l -> r j", t1, grad_temp) + grad_t1 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w1b.T) + del grad_temp + + temp = torch.einsum("i j k l, j r -> i r k l", t1, w1b) + rebuild = torch.einsum("i j k l, i r -> r j k l", temp, w1a) + + grad_w = rebuild * grad_out + del rebuild + + grad_w2a = torch.einsum("r j k l, i j k l -> r i", temp, grad_w) + grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w2a.T) + del grad_w, temp + + grad_w2b = torch.einsum("i r k l, i j k l -> r j", t2, grad_temp) + grad_t2 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w2b.T) + del grad_temp + return grad_t1, grad_w1a, grad_w1b, grad_t2, grad_w2a, grad_w2b, None + + +def make_weight(w1a, w1b, w2a, w2b, scale): + return HadaWeight.apply(w1a, w1b, w2a, w2b, scale) + + +def make_weight_cp(t1, w1a, w1b, t2, w2a, w2b, scale): + return HadaWeightCP.apply(t1, w1a, w1b, t2, w2a, w2b, scale) diff --git a/MoRA/peft_mora/tuners/loha/model.py b/MoRA/peft_mora/tuners/loha/model.py new file mode 100644 index 0000000000000000000000000000000000000000..25315a337a222b703017f329ac0dda7584ab8edd --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/model.py @@ -0,0 +1,114 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
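The `HadaWeight`/`HadaWeightCP` functions above (in `loha/layer.py`) rebuild the LoHa update as an element-wise (Hadamard) product of two low-rank products, which is why the effective rank of the update can exceed `r`. A minimal sketch of that construction; the dimensions are illustrative assumptions, not values from the diff:

``` python
import torch

out_dim, in_dim, r, scale = 64, 32, 4, 1.0
w1a, w1b = torch.randn(out_dim, r), torch.randn(r, in_dim)
w2a, w2b = torch.randn(out_dim, r), torch.randn(r, in_dim)

# Same computation as HadaWeight.forward:
# delta_W = ((w1a @ w1b) * (w2a @ w2b)) * scale
delta_w = (w1a @ w1b) * (w2a @ w2b) * scale

print(delta_w.shape)                      # torch.Size([64, 32])
print(torch.linalg.matrix_rank(delta_w))  # generically r * r = 16 for random factors
```

Note that the custom `backward` in `HadaWeight` saves only the four small factors and recomputes the two low-rank products when gradients are needed, rather than keeping the intermediate products around for autograd.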
+ +import re +from itertools import chain +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft_mora.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, LoHaLayer + + +class LoHaModel(LycorisTuner): + """ + Creates Low-Rank Hadamard Product model from a pretrained model. The method is partially described in + https://arxiv.org/abs/2108.06098 Current implementation heavily borrows from + https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`LoHaConfig`]): The configuration of the LoHa model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The LoHa model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import LoHaModel, LoHaConfig + + >>> config_te = LoHaConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = LoHaConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... use_effective_conv2d=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = LoHaModel(model.text_encoder, config_te, "default") + >>> model.unet = LoHaModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`LoHaConfig`]): The configuration of the LoHa model. + """ + + prefix: str = "hada_" + layers_mapping: Dict[Type[torch.nn.Module], Type[LoHaLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LoHaLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LoHaLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/MoRA/peft_mora/tuners/lokr/__init__.py b/MoRA/peft_mora/tuners/lokr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..408cf2a54ae4c0befa9e3f1cad4ff93d71cfedc5 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import LoKrConfig +from .layer import Conv2d, Linear, LoKrLayer +from .model import LoKrModel + + +__all__ = ["LoKrConfig", "LoKrModel", "Conv2d", "Linear", "LoKrLayer"] diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b00d76dc4698178e3a4696eeec41816f8fc6c3b Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..093a1b7eed661c5f54a0ca30214f9e76df468fb2 Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87d07bd1c43879ed27bcbf76ee2c927b7a5943c9 Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b9f40b1916776cc459f1c7efd38ca1f5ad7c34e Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/config.py b/MoRA/peft_mora/tuners/lokr/config.py new file mode 100644 index 0000000000000000000000000000000000000000..43ed67529b2f17d289d07908ce66230ccd1b5522 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/config.py @@ -0,0 +1,127 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.tuners.lycoris_utils import LycorisConfig +from peft_mora.utils import PeftType + + +@dataclass +class LoKrConfig(LycorisConfig): + """ + Configuration class of [`LoKrModel`]. + + Args: + r (`int`): + LoKr rank. + alpha (`int`): + The alpha parameter for LoKr scaling. + rank_dropout (`float`): + The dropout probability for rank dimension during training. 
+ module_dropout (`float`): + The dropout probability for disabling LoKr modules during training. + use_effective_conv2d (`bool`): + Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + decompose_both (`bool`): + Perform rank decomposition of left kronecker product matrix. + decompose_factor (`int`): + Kronecker product decomposition factor. + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + init_weights (`bool`): + Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is + discouraged. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `alpha`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + """ + + r: int = field(default=8, metadata={"help": "LoKr rank"}) + alpha: int = field(default=8, metadata={"help": "LoKr alpha"}) + rank_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for rank dimension during training"} + ) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling LoKr modules during training"} + ) + use_effective_conv2d: bool = field( + default=False, + metadata={ + "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + }, + ) + decompose_both: bool = field( + default=False, + metadata={"help": "Perform rank decomposition of left kronecker product matrix."}, + ) + decompose_factor: int = field(default=-1, metadata={"help": "Kronecker product decomposition factor."}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with LoKr." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the LoKr layers with their default initialization. 
Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoKr layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LOKR diff --git a/MoRA/peft_mora/tuners/lokr/layer.py b/MoRA/peft_mora/tuners/lokr/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..1a961d19440cb3ea10386b6ca77d79a093c34016 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/layer.py @@ -0,0 +1,409 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
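For orientation, a hedged sketch of how the `LoKrConfig` above might be applied through `get_peft_model` from the vendored `peft_mora` package; the base model (`facebook/opt-125m`), target module names, and hyperparameter values are illustrative assumptions, not taken from the diff:

``` python
from transformers import AutoModelForCausalLM
from peft_mora import LoKrConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

config = LoKrConfig(
    r=8,
    alpha=16,
    target_modules=["q_proj", "v_proj"],  # nn.Linear attention projections in OPT
    rank_dropout=0.0,
    module_dropout=0.0,
    decompose_both=False,  # keep the left Kronecker factor dense
    decompose_factor=-1,   # -1: factorize each dimension near its square root
    task_type="CAUSAL_LM",
)

model = get_peft_model(base, config)
model.print_trainable_parameters()  # only the lokr_* parameters should be trainable
```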
+ +import math +from typing import Any, Optional, Set, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft_mora.tuners.lycoris_utils import LycorisLayer + + +class LoKrLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ( + "lokr_w1", + "lokr_w1_a", + "lokr_w1_b", + "lokr_w2", + "lokr_w2_a", + "lokr_w2_b", + "lokr_t2", + ) + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module) -> None: + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # LoKr info + self.lokr_w1 = nn.ParameterDict({}) + self.lokr_w1_a = nn.ParameterDict({}) + self.lokr_w1_b = nn.ParameterDict({}) + self.lokr_w2 = nn.ParameterDict({}) + self.lokr_w2_a = nn.ParameterDict({}) + self.lokr_w2_b = nn.ParameterDict({}) + self.lokr_t2 = nn.ParameterDict({}) + + @property + def _available_adapters(self) -> Set[str]: + return { + *self.lokr_w1, + *self.lokr_w1_a, + *self.lokr_w1_b, + *self.lokr_w2, + *self.lokr_w2_a, + *self.lokr_w2_b, + *self.lokr_t2, + } + + def create_adapter_parameters( + self, + adapter_name: str, + r: int, + shape, + use_w1: bool, + use_w2: bool, + use_effective_conv2d: bool, + ): + if use_w1: + self.lokr_w1[adapter_name] = nn.Parameter(torch.empty(shape[0][0], shape[1][0])) + else: + self.lokr_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0][0], r)) + self.lokr_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][0])) + + if len(shape) == 4: + # Conv2d + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *shape[2:])) + elif use_effective_conv2d: + self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1])) # b, 1-mode + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) # d, 2-mode + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2] * shape[3])) + else: + # Linear + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1])) + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) + + def reset_adapter_parameters(self, adapter_name: str): + if adapter_name in self.lokr_w1: + nn.init.zeros_(self.lokr_w1[adapter_name]) + else: + nn.init.zeros_(self.lokr_w1_a[adapter_name]) + nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_w2: + nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_t2: + nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + + def reset_adapter_parameters_random(self, adapter_name: str): + if adapter_name in self.lokr_w1: + nn.init.kaiming_uniform_(self.lokr_w1[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_w2: + nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) + 
nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_t2: + nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + alpha: float, + rank_dropout: float, + module_dropout: float, + init_weights: bool, + use_effective_conv2d: bool, + decompose_both: bool, + decompose_factor: int, + **kwargs, + ) -> None: + """Internal function to create lokr adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + alpha (`float`): Alpha for the added adapter. + rank_dropout (`float`): The dropout probability for rank dimension during training + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize adapter weights. + use_effective_conv2d (`bool`): Use parameter effective decomposition for Conv2d with ksize > 1. + decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix. + decompose_factor (`int`): Kronecker product decomposition factor. + """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.alpha[adapter_name] = alpha + self.scaling[adapter_name] = alpha / r + self.rank_dropout[adapter_name] = rank_dropout + self.module_dropout[adapter_name] = module_dropout + base_layer = self.get_base_layer() + + # Determine shape of LoKr weights + if isinstance(base_layer, nn.Linear): + in_dim, out_dim = base_layer.in_features, base_layer.out_features + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n)) # ((a, b), (c, d)), out_dim = a*c, in_dim = b*d + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = not (r < max(shape[0][1], shape[1][1]) / 2) + use_effective_conv2d = False + elif isinstance(base_layer, nn.Conv2d): + in_dim, out_dim = base_layer.in_channels, base_layer.out_channels + k_size = base_layer.kernel_size + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), *k_size) + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) + else: + raise TypeError(f"LoKr is not implemented for base layers of type {type(base_layer).__name__}") + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + # https://github.com/KohakuBlueleaf/LyCORIS/blob/e4259b870d3354a9615a96be61cb5d07455c58ea/lycoris/modules/lokr.py#L224 + if adapter_name in self.lokr_w1: + w1 = self.lokr_w1[adapter_name] + 
else: + w1 = self.lokr_w1_a[adapter_name] @ self.lokr_w1_b[adapter_name] + + if adapter_name in self.lokr_w2: + w2 = self.lokr_w2[adapter_name] + elif adapter_name in self.lokr_t2: + w2 = make_weight_cp(self.lokr_t2[adapter_name], self.lokr_w2_a[adapter_name], self.lokr_w2_b[adapter_name]) + else: + w2 = self.lokr_w2_a[adapter_name] @ self.lokr_w2_b[adapter_name] + + # Make weights with Kronecker product + weight = make_kron(w1, w2) + weight = weight.reshape(self.get_base_layer().weight.shape) + + # Perform rank dropout during training - drop rows of addition weights + rank_dropout = self.rank_dropout[adapter_name] + if self.training and rank_dropout: + drop = (torch.rand(weight.size(0)) > rank_dropout).float() + drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device) + drop /= drop.mean() + weight *= drop + + return weight + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs) + + result = result.to(previous_dtype) + return result + + +class Linear(LoKrLayer): + """LoKr implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + return F.linear(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." 
+ rep + + +class Conv2d(LoKrLayer): + """LoKr implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() + return F.conv2d( + input, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, + ) + + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." + rep + + +# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11 + + +def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]: + """Factorizes the provided number into the product of two numbers + + Args: + dimension (`int`): The number that needs to be factorized. + factor (`int`, optional): + Factorization divider. The algorithm will try to output two numbers, one of each will be as close to the + factor as possible. If -1 is provided, the decomposition algorithm would try to search dividers near the + square root of the dimension. Defaults to -1. + + Returns: + Tuple[`int`, `int`]: A tuple of two numbers, whose product is equal to the provided number. The first number is + always less than or equal to the second. + + Example: + ```py + >>> factorization(256, factor=-1) + (16, 16) + + >>> factorization(128, factor=-1) + (8, 16) + + >>> factorization(127, factor=-1) + (1, 127) + + >>> factorization(128, factor=4) + (4, 32) + ``` + """ + + if factor > 0 and (dimension % factor) == 0: + m = factor + n = dimension // factor + return m, n + if factor == -1: + factor = dimension + m, n = 1, dimension + length = m + n + while m < n: + new_m = m + 1 + while dimension % new_m != 0: + new_m += 1 + new_n = dimension // new_m + if new_m + new_n > length or new_m > factor: + break + else: + m, n = new_m, new_n + if m > n: + n, m = m, n + return m, n + + +def make_weight_cp(t, wa, wb): + rebuild2 = torch.einsum("i j k l, i p, j r -> p r k l", t, wa, wb) # [c, d, k1, k2] + return rebuild2 + + +def make_kron(w1, w2, scale=1.0): + if len(w2.shape) == 4: + w1 = w1.unsqueeze(2).unsqueeze(2) + w2 = w2.contiguous() + rebuild = torch.kron(w1, w2) + + return rebuild * scale diff --git a/MoRA/peft_mora/tuners/lokr/model.py b/MoRA/peft_mora/tuners/lokr/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e1c930b3fb7b7889df6a62b0fe84d4d0f0eaa3 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/model.py @@ -0,0 +1,115 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from itertools import chain +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft_mora.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, LoKrLayer + + +class LoKrModel(LycorisTuner): + """ + Creates Low-Rank Kronecker Product model from a pretrained model. The original method is partially described in + https://arxiv.org/abs/2108.06098 and in https://arxiv.org/abs/2309.14859 Current implementation heavily borrows + from + https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`LoKrConfig`]): The configuration of the LoKr model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The LoKr model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import LoKrModel, LoKrConfig + + >>> config_te = LoKrConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = LoKrConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... use_effective_conv2d=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = LoKrModel(model.text_encoder, config_te, "default") + >>> model.unet = LoKrModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`LoKrConfig`]): The configuration of the LoKr model. + """ + + prefix: str = "lokr_" + layers_mapping: Dict[Type[torch.nn.Module], Type[LoKrLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LoKrLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. 
+ """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LoKrLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/MoRA/peft_mora/tuners/lora.py b/MoRA/peft_mora/tuners/lora.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MoRA/peft_mora/tuners/lora/__init__.py b/MoRA/peft_mora/tuners/lora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a81e5f36933d8f898acb72edee441b3e3d4537d --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .config import LoftQConfig, LoraConfig +from .gptq import QuantLinear +from .layer import Conv2d, Embedding, Linear, LoraLayer +from .model import LoraModel + + +__all__ = ["LoraConfig", "LoftQConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"] + + +def __getattr__(name): + if (name == "Linear8bitLt") and is_bnb_available(): + from .bnb import Linear8bitLt + + return Linear8bitLt + + if (name == "Linear4bit") and is_bnb_4bit_available(): + from .bnb import Linear4bit + + return Linear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df371c5eba4279f6ba9a4e8cc903d6a51c902000 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/aqlm.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/aqlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87581be9e84c5043d9e880713bc2b79c7d3b5cf4 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/aqlm.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/awq.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/awq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edc3a080105b1f0b95c460e00428a06954b29c04 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/awq.cpython-312.pyc differ diff --git 
a/MoRA/peft_mora/tuners/lora/__pycache__/bnb.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/bnb.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d89f91ff3ccb16f1c76816369ee09d4bf64a10c4 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/bnb.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0d5a321b3b9c91688c786754ba4f17aa3a19593 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/gptq.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/gptq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57ab2777599acd805c821caa445dd48bc83c875d Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/gptq.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab79867105c22f997466ea40f3658183775c5c7 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1235d0ca7be5bd6c6b13e7e8d10ebdc0916e9d61 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/tp_layer.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/tp_layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05fe836dd5476a51ea5764cbe304d176036cd4ae Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/tp_layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/aqlm.py b/MoRA/peft_mora/tuners/lora/aqlm.py new file mode 100644 index 0000000000000000000000000000000000000000..6b359b3c86a6db5b643928e4f52e5fe6f72bd421 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/aqlm.py @@ -0,0 +1,100 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
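The module-level `__getattr__` in `lora/__init__.py` above is the PEP 562 lazy-import pattern: the bitsandbytes-backed classes are resolved only on first access, so the package imports cleanly when `bitsandbytes` is absent. A standalone sketch of the same idea; the availability check and the `__main__` usage below are illustrative, not code from the diff:

``` python
import importlib.util
import sys


def _bnb_available() -> bool:
    # Rough stand-in for peft_mora.import_utils.is_bnb_available()
    return importlib.util.find_spec("bitsandbytes") is not None


def __getattr__(name):  # only consulted when normal module lookup fails (PEP 562)
    if name == "Linear8bitLt" and _bnb_available():
        from bitsandbytes.nn import Linear8bitLt  # deferred, potentially heavy import
        return Linear8bitLt
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


if __name__ == "__main__":
    this_module = sys.modules[__name__]
    try:
        print(getattr(this_module, "Linear8bitLt"))
    except AttributeError as exc:
        print(exc)  # bitsandbytes missing: the attribute simply does not exist
```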
+ +from typing import Any, Optional + +import torch + +from peft_mora.import_utils import is_aqlm_available +from peft_mora.tuners.lora.layer import LoraLayer +from peft_mora.tuners.tuners_utils import BaseTunerLayer + + +if is_aqlm_available(): + from aqlm import QuantizedLinear + + +class AqlmLoraLinear(torch.nn.Module, LoraLayer): + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer) + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora) + + def forward(self, x: torch.Tensor): + # note: logic differs from default Linear because merging is not supported + result = self.base_layer(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 + # def reset_lora_parameters(self, adapter_name): + # if adapter_name in self.lora_A.keys(): + # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) + # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) + + +def dispatch_aqlm( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if is_aqlm_available() and isinstance(target_base_layer, QuantizedLinear): + new_module = AqlmLoraLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.codes + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/awq.py b/MoRA/peft_mora/tuners/lora/awq.py new file mode 100644 index 0000000000000000000000000000000000000000..3ad45be6c9e7b88ff2856ae66e851e56f5da0f39 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/awq.py @@ -0,0 +1,108 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
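`dispatch_aqlm` above follows the dispatcher convention shared by the quantization backends: inspect the (possibly wrapped) base layer and return either a LoRA-wrapped module or `None`, so the caller can fall through to the next backend. A hedged sketch of that convention; `MyQuantLinear`, `MyBackendLoraLinear`, and `create_new_module` are made-up names, not from the diff:

``` python
from typing import Any, Callable, List, Optional

import torch.nn as nn


class MyQuantLinear(nn.Linear):
    """Stand-in for a backend-specific quantized linear layer."""


class MyBackendLoraLinear(nn.Module):
    """Stand-in for the LoRA wrapper a real dispatcher would construct."""

    def __init__(self, base_layer: nn.Module, adapter_name: str, **kwargs: Any) -> None:
        super().__init__()
        self.base_layer = base_layer
        self.adapter_name = adapter_name

    def forward(self, x):
        return self.base_layer(x)  # adapter math omitted in this sketch


def dispatch_my_backend(target: nn.Module, adapter_name: str, **kwargs: Any) -> Optional[nn.Module]:
    if isinstance(target, MyQuantLinear):
        return MyBackendLoraLinear(target, adapter_name, **kwargs)
    return None  # not this backend: let the next dispatcher try


def create_new_module(target: nn.Module, adapter_name: str,
                      dispatchers: List[Callable[..., Optional[nn.Module]]], **kwargs: Any) -> nn.Module:
    for dispatch in dispatchers:
        new_module = dispatch(target, adapter_name, **kwargs)
        if new_module is not None:
            return new_module
    raise ValueError(f"No dispatcher could handle a target of type {type(target).__name__}.")


# First matching dispatcher wins.
wrapped = create_new_module(MyQuantLinear(16, 16), "default", [dispatch_my_backend])
print(type(wrapped).__name__)  # MyBackendLoraLinear
```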
+import importlib.metadata as importlib_metadata +from typing import Any, Optional + +import packaging.version +import torch + +from peft_mora.import_utils import is_auto_awq_available +from peft_mora.tuners.lora.layer import LoraLayer +from peft_mora.tuners.tuners_utils import BaseTunerLayer + + +if is_auto_awq_available(): + from awq.modules.linear import WQLinear_GEMM + + +class AwqLoraLinear(torch.nn.Module, LoraLayer): + def __init__( + self, + base_layer, + adapter_name, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer) + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora) + + def forward(self, x: torch.Tensor): + result = self.quant_linear_module(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result = result + output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +def dispatch_awq( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if is_auto_awq_available() and isinstance(target_base_layer, WQLinear_GEMM): + # Raise the error only at the dispatch level + AUTOAWQ_MINIMUM_VERSION = packaging.version.parse("0.2.0") + version_autoawq = packaging.version.parse(importlib_metadata.version("autoawq")) + + if AUTOAWQ_MINIMUM_VERSION > version_autoawq: + raise ImportError( + f"Found an incompatible version of auto-awq. Found version {version_autoawq}, " + f"but only versions above {AUTOAWQ_MINIMUM_VERSION} are supported for PEFT." + ) + + new_module = AwqLoraLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/bnb.py b/MoRA/peft_mora/tuners/lora/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..605a50b783d3b80e4871adaf5bec88151110751b --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/bnb.py @@ -0,0 +1,399 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import List, Optional + +import bitsandbytes as bnb +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft_mora.utils.other import transpose + +from .layer import LoraLayer + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + use_mora: bool = False, + mora_type: int = 1, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_mora=use_mora, + mora_type=mora_type, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Merge lora module to 8-bit linear may get different generations due to rounding errors." + ) + lora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + + # Dequantize the result of identity matrix and int8 weight because bitsandbytes does not support int8 + # dequantization directly + im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device) + im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im) + im, Sim = bnb.functional.transform(im, "col32") + if state.CxB is None: + state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB) + out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB) + output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t() + + w_data = output.to(lora_data.dtype).to(lora_data.device) + lora_data + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Unmerge lora module to 8-bit linear may get different generations due to rounding errors." + ) + lora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device) + im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im) + im, Sim = bnb.functional.transform(im, "col32") + + if state.CxB is None: + state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB) + out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB) + output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t() + + w_data = output.to(lora_data.dtype).to(lora_data.device) - lora_data + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + + def get_delta_weight(self, adapter): + return ( + transpose( + self.lora_B[adapter].weight @ self.lora_A[adapter].weight, + False, + ) + * self.scaling[adapter] + ) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + compute_dtype = lora_A.weight.dtype + if x.dtype != compute_dtype: + x = x.to(compute_dtype) + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result = result + output + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + def dispatch_bnb_8bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_8bit = kwargs.get("loaded_in_8bit", False) + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target.state.has_fp16_weights, + "memory_efficient_backward": target.state.memory_efficient_backward, + "threshold": target.state.threshold, + "index": target.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, **eightbit_kwargs) + + return new_module + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + use_mora: bool = False, + mora_type: int = 1, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_mora=use_mora, + mora_type=mora_type, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Merge lora module to 4-bit linear may get different generations due to rounding errors." + ) + # Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930 + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + lora_data = self.get_delta_weight(active_adapter) + + w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + lora_data + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( + weight.device + ) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Unmerge lora module to 4-bit linear may get different generations due to rounding errors." + ) + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + lora_data = self.get_delta_weight(active_adapter) + w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) - lora_data + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( + weight.device + ) + + def get_delta_weight(self, adapter): + return ( + transpose( + self.lora_B[adapter].weight @ self.lora_A[adapter].weight, + False, + ) + * self.scaling[adapter] + ) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. + result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + if self.use_mora[active_adapter]: + x = dropout(x) + output = self._apply_mora(x, lora_A, lora_B, scaling, active_adapter) + else: + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result = result + output + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + def dispatch_bnb_4bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_4bit = kwargs.get("loaded_in_4bit", False) + if loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, **fourbit_kwargs) + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/config.py b/MoRA/peft_mora/tuners/lora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..fe0d567efc0c65cced8008680542b27a8a53eb0e --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/config.py @@ -0,0 +1,292 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, Optional, Union + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + + +@dataclass +class LoftQConfig: + """ + This is the sub-configuration class to store the configuration of a [`LoraModel`]. + + Args: + bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the + default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. + bits (`int`): Quantization bits for LoftQ. + iter (`int`): Alternating iterations for LoftQ. + fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear + models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 + bits. + """ + + loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + + +@dataclass +class LoraConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`LoraModel`]. + + Args: + r (`int`): + Lora attention dimension (the "rank"). + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + lora_alpha (`int`): + The alpha parameter for Lora scaling. + lora_dropout (`float`): + The dropout probability for Lora layers. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + bias (`str`): + Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases + will be updated during training. Be aware that this means that, even when disabling the adapters, the model + will not produce the same output as the base model would have without adaptation. + use_rslora (`bool`): + When set to True, uses Rank-Stabilized LoRA which + sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work better. + Otherwise, it will use the original default value of `lora_alpha/r`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. 
+ init_lora_weights (`bool` | `Literal["gaussian", "loftq"]`): + How to initialize the weights of the adapter layers. Passing True (default) results in the default + initialization from the reference implementation from Microsoft. Passing 'gaussian' results in Gaussian + initialization scaled by the LoRA rank for linear and layers. Setting the initialization to False leads to + completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ initialization. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `lora_alpha`. + megatron_config (`Optional[dict]`): + The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You can + get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from Megatron. + The arguments will be used to initialize the TransformerConfig of Megatron. You need to specify this + parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear layers of megatron. + megatron_core (`Optional[str]`): + The core module from Megatron to use, defaults to `"megatron.core"`. + loftq_config (`Optional[LoftQConfig]`): + The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone weights + and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not pass a + quantized model in this case, as LoftQ will quantize the model itself. + use_dora (`bool`): + Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the weights + into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is + handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low + ranks. Right now, DoRA only supports non-quantized linear layers. DoRA introduces a bigger overhead than + pure LoRA, so it is recommended to merge weights for inference. For more information, see + https://arxiv.org/abs/2402.09353. + """ + + r: int = field(default=8, metadata={"help": "Lora attention dimension"}) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with LoRA." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually." 
+ ), + }, + ) + lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) + lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: Literal["none", "all", "lora_only"] = field( + default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"} + ) + use_rslora: bool = field( + default=False, + metadata={ + "help": ( + "When set to True, uses Rank-Stabilized LoRA doi.org/10.48550/arXiv.2312.03732" + " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it" + " was proven to work better. Otherwise, it will use the original default" + " value of `lora_alpha/r`." + ) + }, + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_lora_weights: bool | Literal["gaussian", "loftq"] = field( + default=True, + metadata={ + "help": ( + "How to initialize the weights of the LoRA layers. Passing True (default) results in the default " + "initialization from the reference implementation from Microsoft. Passing 'gaussian' results " + "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " + "to False leads to completely random initialization and is discouraged." + "Pass `'loftq'` to use LoftQ initialization" + ), + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + "This only works when target_modules is a list of str." + }, + ) + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" + ) + }, + ) + megatron_config: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer." + "You can get it like this, `core_transformer_config_from_args(get_args())`, " + "these two functions being from Megatron." + "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and " + "RowParallelLinear layers of megatron." 
+ "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " + "functions, because TransformerConfig may not necessarily be serialized." + "But when using megatron, we can use `get_peft_model_state_dict` function and " + "megatron's framework, they can also save and load models and configurations." + ) + }, + ) + megatron_core: Optional[str] = field( + default="megatron.core", + metadata={ + "help": ( + "The core module from Megatron, it is used to create LoRA's parallel linear layer. " + "It only needs to be passed in when you need to use your own modified megatron core module. " + "Otherwise, it will use the default value `megatron.core`. " + ) + }, + ) + # dict type is used when loading config.json + loftq_config: Union[LoftQConfig, dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." + ) + }, + ) + use_dora: bool = field( + default=False, + metadata={ + "help": ( + "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the " + "weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the " + "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, " + "especially at low ranks. Right now, DoRA only supports non-quantized linear layers. DoRA introduces " + "a bigger overhead than pure LoRA, so it is recommended to merge weights for inference. For more " + "information, see https://arxiv.org/abs/2402.09353." + ) + }, + ) + use_mora: bool = field( + default=False, + metadata={ + "help": ( + "Enable MoRA" + ) + }, + ) + + mora_type: int = field( + default=1, + metadata={ + "help": ( + "Enable MoRA" + ) + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LORA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + # if target_modules is a regex expression, then layers_to_transform should be None + if isinstance(self.target_modules, str) and self.layers_to_transform is not None: + raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + + # if target_modules is a regex expression, then layers_pattern should be None + if isinstance(self.target_modules, str) and self.layers_pattern is not None: + raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + + if self.use_dora and (self.megatron_config or self.init_lora_weights == "loftq"): + raise ValueError("DoRA does not support megatron_core or LoftQ. Please set `use_dora=False`.") + + # handle init_lora_weights and loftq_config + if self.init_lora_weights == "loftq": + import importlib + + if not importlib.util.find_spec("scipy"): + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + if self.loftq_config is None: + raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") + + # convert loftq_config to dict + if self.loftq_config and not isinstance(self.loftq_config, dict): + self.loftq_config = vars(self.loftq_config) diff --git a/MoRA/peft_mora/tuners/lora/gptq.py b/MoRA/peft_mora/tuners/lora/gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..ee36adda1d7c734649600422b608808f077a0834 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/gptq.py @@ -0,0 +1,114 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Optional + +import torch + +from peft_mora.tuners.lora.layer import LoraLayer +from peft_mora.tuners.tuners_utils import BaseTunerLayer +from peft_mora.utils import get_auto_gptq_quant_linear + + +class QuantLinear(torch.nn.Module, LoraLayer): + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + ) + + def forward(self, x: torch.Tensor): + # note: logic differs from default Linear because merging is not supported + result = self.quant_linear_module(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 + # def reset_lora_parameters(self, adapter_name): + # if adapter_name in self.lora_A.keys(): + # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) + # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) + + +def dispatch_gptq( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + + if AutoGPTQQuantLinear is not None and isinstance(target_base_layer, AutoGPTQQuantLinear): + new_module = QuantLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/layer.py b/MoRA/peft_mora/tuners/lora/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b1423fefce5804cc7025923b7d6b9c0e6dda29 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/layer.py @@ -0,0 +1,1096 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
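The rest of the `layer.py` diff holds the core MoRA logic: `update_layer` replaces the two rectangular LoRA matrices with a single square matrix (it points `lora_B` at the same `nn.Linear` as `lora_A`), choosing the square-matrix rank so the trainable parameter count stays close to a rank-`r` LoRA pair, while `_apply_mora` compresses the input down to that rank and expands the output back to the layer width. A minimal sketch of the rank calculation follows, assuming a hypothetical 4096 x 4096 projection purely for concreteness (the formula mirrors the one in `update_layer` below; the layer shape and rank are illustrative only):

``` python
# Sketch of the MoRA rank computation performed in LoraLayer.update_layer below.
# The layer shape (4096 x 4096) and requested rank (r = 8) are illustrative only.
import math

in_features, out_features, r = 4096, 4096, 8

# LoRA at rank r trains A (r x in_features) and B (out_features x r).
lora_params = r * (in_features + out_features)                  # 65536

# MoRA trains one square matrix of size r_hat x r_hat with a comparable budget.
mora_type = 6  # 6 = RoPE-based compression; 1 = sharing-based compression
r_hat = int(math.sqrt((in_features + out_features) * r) + 0.5)  # 256
if mora_type == 6:
    r_hat = r_hat // 2 * 2  # the RoPE-based variant needs an even rank

mora_params = r_hat * r_hat                                     # 65536

print(lora_params, r_hat, mora_params)
```

For this shape the square matrix has exactly as many parameters as the rank-8 LoRA pair it replaces; when the square root is not exact, the rounding makes the two counts differ only slightly.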
+ +import math +import warnings +from typing import Any, List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers.pytorch_utils import Conv1D + +from peft_mora.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft_mora.utils.integrations import gather_params_ctx +from peft_mora.utils.other import transpose + +from .config import LoraConfig + + +class LoraLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout") + + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + self.base_layer = base_layer + self.r = {} + self.lora_alpha = {} + self.scaling = {} + self.lora_dropout = nn.ModuleDict({}) + self.lora_A = nn.ModuleDict({}) + self.lora_B = nn.ModuleDict({}) + # For Embedding layer + self.lora_embedding_A = nn.ParameterDict({}) + self.lora_embedding_B = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.use_dora: dict[str, bool] = {} + self.lora_magnitude_vector: Optional[torch.nn.ParameterDict] = None # for DoRA + self._caches: dict[str, Any] = {} + self.kwargs = kwargs + + self.use_mora: dict[str, bool] = {} + + self.mora_type: dict[str, int] = {} + + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): + # QuantLinear + in_features, out_features = base_layer.infeatures, base_layer.outfeatures + elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = base_layer.input_size, base_layer.output_size + elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": + # AQLM QuantLinear + in_features, out_features = base_layer.in_features, base_layer.out_features + elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM": + # Awq layers + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + self.in_features = in_features + self.out_features = out_features + + def update_layer( + self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora: bool = False, + use_mora: bool = False, mora_type: int = 1, + ): + # This code works for linear layers, override for other layer types + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + 
self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) + + self.use_mora[adapter_name] = False + self.mora_type[adapter_name] = mora_type + + if use_mora: + new_r = int(math.sqrt((self.in_features + self.out_features)*r)+0.5) + if mora_type == 6: + # type 6 require new_r to be even for RoPE + new_r = new_r//2*2 + + self.lora_A[adapter_name] = nn.Linear(new_r, new_r, bias=False) + self.r[adapter_name] = new_r + + nn.init.zeros_(self.lora_A[adapter_name].weight) + self.lora_B[adapter_name] = self.lora_A[adapter_name] + self.use_mora[adapter_name] = True + self.scaling[adapter_name] = 1.0 + else: + # Actual trainable parameters + self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) + self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + # check weight and qweight (for GPTQ) + for weight_name in ("weight", "qweight"): + weight = getattr(self.get_base_layer(), weight_name, None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + break + + if use_dora: + self.dora_init(adapter_name) + self.use_dora[adapter_name] = True + else: + self.use_dora[adapter_name] = False + + self.set_adapter(self.active_adapters) + + def reset_lora_parameters(self, adapter_name, init_lora_weights, mora_type=None): + if init_lora_weights is False: + return + + if self.use_mora[adapter_name]: + nn.init.zeros_(self.lora_A[adapter_name].weight) + self.lora_B[adapter_name] = self.lora_A[adapter_name] + if mora_type is not None: + self.mora_type[adapter_name] = mora_type + return + + if adapter_name in self.lora_A.keys(): + if init_lora_weights is True: + # initialize A the same way as the default for nn.Linear and B to zero + # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 + nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + elif init_lora_weights.lower() == "gaussian": + nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_lora_weights=}") + nn.init.zeros_(self.lora_B[adapter_name].weight) + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + nn.init.zeros_(self.lora_embedding_A[adapter_name]) + nn.init.normal_(self.lora_embedding_B[adapter_name]) + + def loftq_init(self, adapter_name): + from peft_mora.utils.loftq_utils import loftq_init + + weight = self.get_base_layer().weight + kwargs = { + "num_bits": self.kwargs.get("loftq_bits", 4), + "reduced_rank": self.r[adapter_name], + "num_iter": self.kwargs.get("loftq_iter", 1), + } + + qweight, lora_A, lora_B = loftq_init(weight, **kwargs) + if adapter_name in self.lora_A.keys(): + # initialize A the same way as the default for nn.Linear and B to zero + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + self.lora_embedding_A[adapter_name].weight.data 
= lora_A + self.lora_embedding_B[adapter_name].weight.data = lora_B + self.get_base_layer().weight.data = qweight + + def _get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor: + # calculate L2 norm of weight matrix, column-wise + weight = weight + scaling * lora_weight + weight_norm = torch.linalg.norm(weight, dim=1) + return weight_norm + + def dora_init(self, adapter_name: str) -> None: + lora_A = self.lora_A[adapter_name] + lora_B = self.lora_B[adapter_name] + scaling = self.scaling[adapter_name] + with gather_params_ctx(self.get_base_layer()): + weight = self.get_base_layer().weight + lora_weight = lora_B.weight @ lora_A.weight + weight_norm = self._get_weight_norm(weight, lora_weight, scaling) + self.lora_magnitude_vector = nn.ParameterDict() + self.lora_magnitude_vector[adapter_name] = nn.Parameter(weight_norm, requires_grad=True) + # add lora_magnitude_vector to the list of learnable parameters + self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) + + def _cache_store(self, key: str, value: Any) -> None: + self._caches[key] = value + + def _cache_pop(self, key: str) -> Any: + value = self._caches.pop(key) + return value + + def _apply_mora(self, x, lora_A, lora_B, scaling, active_adapter): + in_f, out_f = self.in_features, self.out_features + r = self.r[active_adapter] + if active_adapter in self.mora_type: + mora_type = self.mora_type[active_adapter] + else: + mora_type = 1 + + if mora_type == 1 or mora_type == 4: + sum_inter = in_f // r + if in_f % r != 0: + pad_size = r - in_f % r + # x = torch.cat([x, torch.zeros_like(x)[..., :pad_size]], dim=-1) + x = torch.cat([x, x[..., :pad_size]], dim=-1) + sum_inter += 1 + in_x = x.view(*x.shape[:-1], sum_inter, r).sum(dim=-2) + elif mora_type == 2 or mora_type == 3: + mr, nr = in_f//r+1, in_f//r + m, n = in_f - r*nr, r*mr - in_f + mm, nn = m*mr, n * nr + if m > 0: + x_m, x_n = x[..., :mm], x[..., mm:] + x_m = x_m.view(*x.shape[:-1], m, mr).sum(dim=-1) + x_n = x_n.view(*x.shape[:-1], n, nr).sum(dim=-1) + in_x = torch.cat([x_m, x_n ], dim=-1) + else: + in_x = x.view(*x.shape[:-1], n, nr).sum(dim=-1) + elif mora_type == 6: + sum_inter = in_f // r + rb1 = in_f//r if in_f % r == 0 else in_f//r + 1 + if in_f % r != 0: + pad_size = r - in_f % r + x = torch.cat([x, x[..., :pad_size]], dim=-1) + sum_inter += 1 + in_x = x.view(*x.shape[:-1], sum_inter, r) + if not hasattr(self, 'cos') and not hasattr(self, 'sin'): + inv_freq = 1.0 / (10000 ** (torch.arange(0, r, 2).float() / r)) + t = torch.arange(rb1) + freqs = torch.outer(t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos = emb.cos().unsqueeze(0).to(x.device).to(x.dtype) + self.sin = emb.sin().unsqueeze(0).to(x.device).to(x.dtype) + rh_in_x = torch.cat((-in_x[..., r//2:], in_x[..., :r//2]), dim=-1) + in_x = in_x*self.cos + rh_in_x*self.sin + + + out_x = lora_A(in_x) + + if mora_type == 1 or mora_type == 3: + repeat_time = out_f // r + if out_f % r != 0: + repeat_time += 1 + out_x = torch.cat([out_x]*repeat_time, dim=-1)[..., :out_f] + elif mora_type == 2 or mora_type == 4: + mr, nr = out_f//r+1, out_f//r + m, n = out_f - r*nr, r*mr - out_f + mm, nn = m*mr, n * nr + if m > 0: + out_x = torch.cat([torch.repeat_interleave(out_x[..., :m], mr, dim=-1), + torch.repeat_interleave(out_x[..., m:], nr, dim=-1)] + , dim=-1) + else: + out_x = torch.repeat_interleave(out_x, nr, dim=-1) + elif mora_type == 6: + out_x = out_x.view(*x.shape[:-1], -1)[..., :out_f] + if out_x.shape[-1] < out_f: + repeat_time = out_f // out_x.shape[-1] + if out_f % 
out_x.shape[-1] != 0: + repeat_time += 1 + out_x = torch.cat([out_x]*repeat_time, dim=-1)[..., :out_f] + + return out_x + + def _apply_dora(self, x, lora_A, lora_B, scaling, active_adapter): + """ + For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer + output. + """ + lora_weight = lora_B.weight @ lora_A.weight + magnitude = self.lora_magnitude_vector[active_adapter] + weight = self.get_base_layer().weight + weight_norm = self._get_weight_norm(weight, lora_weight, scaling) + # see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353) + # "[...] we suggest treating ||V +βˆ†V ||_c in + # Eq. (5) as a constant, thereby detaching it from the gradient + # graph. This means that while ||V + βˆ†V ||_c dynamically + # reflects the updates of βˆ†V , it won’t receive any gradient + # during backpropagation" + weight_norm = weight_norm.detach() + mag_norm_scale = (magnitude / weight_norm).view(1, -1) + result_dora = (mag_norm_scale - 1) * ( + F.linear(x, transpose(weight, self.fan_in_fan_out)) + ) + mag_norm_scale * lora_B(lora_A(x)) * scaling + + # Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again. + # This is only correct if dropout=0, otherwise results will differ: + # https://github.com/huggingface/peft/pull/1474#issuecomment-1964682771 + # bias = self.get_base_layer().bias + # if bias is not None: + # result = result - bias + # result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling + # if bias is not None: + # result = result + bias + + return result_dora + + def set_scale(self, adapter, scale): + if adapter not in self.scaling: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + self.scaling[active_adapter] *= scale + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + if scale is None: + self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + +# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# and modified to work with PyTorch FSDP + + +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
+# ------------------------------------------------------------------------------------------ + + +class Linear(nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_target_conv_1d_layer: bool = False, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + use_mora: bool = False, + mora_type: int = 1, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_mora=use_mora, + mora_type=mora_type, + ) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter) + if not self.use_dora[active_adapter]: + orig_weights += delta_weight + else: + # handle dora + # since delta_weight already includes scaling, set it to 1 here + weight_norm = self._get_weight_norm(orig_weights, delta_weight, scaling=1).detach() + # We need to cache weight_norm because it has to be based on the original weights. We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + self._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm + orig_weights = dora_factor.view(-1, 1) * (orig_weights + delta_weight) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + delta_weight = self.get_delta_weight(active_adapter) + if not self.use_dora[active_adapter]: + base_layer.weight.data += delta_weight + else: + # handle dora + # since delta_weight already includes scaling, set it to 1 here + weight_norm = self._get_weight_norm(base_layer.weight, delta_weight, scaling=1).detach() + # We need to cache weight_norm because it has to be based on the original weights. 
We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + self._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm + new_weight = dora_factor.view(-1, 1) * (base_layer.weight.data + delta_weight) + base_layer.weight.data = new_weight + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + weight = self.get_base_layer().weight + delta_weight = self.get_delta_weight(active_adapter) + if not self.use_dora[active_adapter]: + weight.data -= delta_weight + else: + weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") + dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm + weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight + weight.data = weight_orig + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_B[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. + cast_to_fp32 = device.type == "cpu" and dtype == torch.float16 + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + if self.use_mora[adapter]: + in_f, out_f = self.in_features, self.out_features + r = self.r[adapter] + if in_f % r != 0: + pad_size = r - in_f % r + else: + pad_size = 0 + repeat_time = out_f // r + if out_f % r != 0: + repeat_time += 1 + + if adapter not in self.mora_type or self.mora_type[adapter] == 1: + w = torch.zeros(r, in_f).to(device, dtype=dtype) + aw = weight_A + for i in range(in_f + pad_size): + w[:, i % in_f] += aw[:, i % r] + w = torch.cat([w]*repeat_time, dim=0)[:out_f] + elif self.mora_type[adapter] == 2: + w = weight_A + mr, nr = in_f//r+1, in_f//r + m, n = in_f - r*nr, r*mr - in_f + + mm, nn = m*mr, n * nr + w = torch.cat([torch.repeat_interleave(w[:, :m], mr, dim=1), + torch.repeat_interleave(w[:, m:], nr, dim=1)], dim=1) + + mr, nr = out_f//r+1, out_f//r + m, n = out_f - r*nr, r*mr - out_f + mm, nn = m*mr, n * nr + w = torch.cat([torch.repeat_interleave(w[:m], mr, dim=0), + torch.repeat_interleave(w[m:], nr, dim=0)], dim=0) + elif self.mora_type[adapter] == 3: + w = weight_A + mr, nr = in_f//r+1, in_f//r + m, n = in_f - r*nr, r*mr - in_f + mm, nn = m*mr, n * nr + w = torch.cat([torch.repeat_interleave(w[:, :m], mr, dim=1), + torch.repeat_interleave(w[:, m:], nr, dim=1)], dim=1) + + w = torch.cat([w]*repeat_time, dim=0)[:out_f] + elif self.mora_type[adapter] == 4: + w = torch.zeros(r, in_f).to(device, dtype=dtype) + aw = weight_A + for i in range(in_f + pad_size): + w[:, i % in_f] += aw[:, i % r] + + mr, nr = out_f//r+1, out_f//r + m, n = out_f - r*nr, r*mr - out_f + mm, nn = m*mr, n * nr + w = 
torch.cat([torch.repeat_interleave(w[:m], mr, dim=0), + torch.repeat_interleave(w[m:], nr, dim=0)], dim=0) + elif self.mora_type[adapter] == 6: + w = torch.zeros(in_f+pad_size, in_f).to(device, dtype=dtype) + rb1 = in_f//r if in_f % r == 0 else in_f//r + 1 + rb2 = out_f//r if out_f % r == 0 else out_f//r + 1 + sum_inter, repeat_time = rb1, rb2 + if not hasattr(self, 'cos') and not hasattr(self, 'sin'): + inv_freq = 1.0 / (10000 ** (torch.arange(0, r, 2).float() / r)) + t = torch.arange(rb1) + freqs = torch.outer(t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos = emb.cos().unsqueeze(0).to(w.device).to(w.dtype) + self.sin = emb.sin().unsqueeze(0).to(w.device).to(w.dtype) + cos, sin = self.cos, self.sin + aw = weight_A + aw2 = torch.cat((aw[:, r//2:], -aw[:, :r//2]), dim=-1) + for i in range(sum_inter-1): + w[i*r:(i+1)*r, i*r:(i+1)*r] = aw2*sin[:, i] + aw*cos[:, i] + i+=1 + w[i*r:, i*r:] = (aw2*sin[:, i] + aw*cos[:, i])[:, :r-pad_size] #+ aw2*sin[:, i])[:, :r-pad_size] + if pad_size > 0: + w[i*r:, :pad_size] = (aw2*sin[:, i] + aw*cos[:, i])[:, r-pad_size:] + if in_f < out_f: + w = torch.cat([w]*repeat_time, dim=0)[:out_f] + else: + w = w[:out_f] + else: + # old + w = torch.zeros(r, in_f).to(device, dtype=dtype) + aw = weight_A + for i in range(in_f): + w[:, i % in_f] += aw[:, i % r] + #w = torch.cat([w]*repeat_time, dim=0)[:out_f] + w = torch.cat([torch.repeat_interleave(w, out_f//r, dim=0), w], dim=0)[:out_f] + output_tensor = w + else: + output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + # print rank of output_tensor + # print(f'rank: {torch.linalg.matrix_rank(output_tensor.float())}') + return output_tensor + + # use_mora_merge_ft = False + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + # elif hasattr(self, 'use_mora_merge_ft') and self.use_mora_merge_ft: + # print('use_mora_merge_ft') + # active_adapter = self.active_adapters[0] + # ow = self.base_layer.weight.clone() + # in_f, out_f = self.in_features, self.out_features + # r = self.r[active_adapter] + # pad_size = r - in_f % r if in_f % r != 0 else 0 + # repeat_time = out_f // r + # if out_f % r != 0: repeat_time += 1 + # aw = self.lora_A[active_adapter].weight + # w = torch.zeros(r, in_f).to(ow.device, dtype=ow.dtype) + # for i in range(in_f + pad_size): + # w[:, i % in_f] += aw[:, i % r] + # w = torch.cat([w]*repeat_time, dim=0)[:out_f] + # result = F.linear(x, ow+w, self.base_layer.bias) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + if self.use_mora[active_adapter]: + # x = dropout(x) + # delta = self._apply_mora(x, lora_A, lora_B, scaling, active_adapter) + # print(delta.abs().mean().item()) + # with open('mora.txt', 'w') as f: + # print(delta.abs().mean().item(), file=f) + # result = result + delta + + x = 
dropout(x) + result = result + self._apply_mora(x, lora_A, lora_B, scaling, active_adapter) + elif not self.use_dora[active_adapter]: + # delta = lora_B(lora_A(dropout(x))) * scaling + # print(delta.abs().mean().item()) + # with open('lora.txt', 'w') as f: + # print(delta.abs().mean().item(), file=f) + # result = result + delta + + result = result + lora_B(lora_A(dropout(x))) * scaling + else: + x = dropout(x) + result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter) + + result = result.to(torch_result_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +class Embedding(nn.Module, LoraLayer): + # LoRA implemented in a Embedding layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + ) + + def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + weight_A = torch.randn((r, self.in_features)) + weight_B = torch.randn((self.out_features, r)) + self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) + self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + base_layer = self.get_base_layer() + weight = getattr(base_layer, "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + self.to(base_layer.weight.device, dtype=weight.dtype) + self.set_adapter(self.active_adapters) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_embedding_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_embedding_A.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_embedding_B[adapter].device + dtype = self.lora_embedding_A[adapter].dtype + + # In case users wants to merge the adapter weights that are in + # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. + cast_to_fp32 = device.type == "cpu" and dtype == torch.float16 + + weight_A = self.lora_embedding_A[adapter] + weight_B = self.lora_embedding_B[adapter] + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_embedding_A[adapter] = weight_A.to(dtype) + self.lora_embedding_B[adapter] = weight_B.to(dtype) + + return output_tensor + + def _embed(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + base_layer = self.get_base_layer() + return F.embedding( + input, + weight, + padding_idx=base_layer.padding_idx, + max_norm=base_layer.max_norm, + norm_type=base_layer.norm_type, + scale_grad_by_freq=base_layer.scale_grad_by_freq, + sparse=base_layer.sparse, + ) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # TODO: no dtype conversion here, unlike in Linear, is that correct? 
+ if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_embedding_A: + continue + embedding_A = self.lora_embedding_A[active_adapter].T + embedding_B = self.lora_embedding_B[active_adapter].T + scaling = self.scaling[active_adapter] + after_A = self._embed(x, embedding_A) + result += (after_A @ embedding_B) * scaling + result = result.to(torch_result_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +class Conv2d(nn.Module, LoraLayer): + # Lora implemented in a conv2d layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + ) + + def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + base_layer = self.get_base_layer() + kernel_size = base_layer.kernel_size + stride = base_layer.stride + padding = base_layer.padding + self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False) + self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + weight = getattr(base_layer, "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + self.to(base_layer.weight.device, dtype=weight.dtype) + self.set_adapter(self.active_adapters) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights inside the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_A[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. + cast_to_fp32 = device.type == "cpu" and dtype == torch.float16 + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + # https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117 + if self.get_base_layer().weight.size()[2:4] == (1, 1): + # conv2d 1x1 + output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze( + 3 + ) * self.scaling[adapter] + else: + # conv2d 3x3 + output_tensor = ( + F.conv2d( + weight_A.permute(1, 0, 2, 3), + weight_B, + ).permute(1, 0, 2, 3) + * self.scaling[adapter] + ) + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + return output_tensor + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + result += lora_B(lora_A(dropout(x))) * scaling + + result = result.to(torch_result_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + +def dispatch_default( + target: torch.nn.Module, + adapter_name: str, + lora_config: LoraConfig, + **kwargs, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Embedding): + embedding_kwargs = kwargs.copy() + embedding_kwargs.pop("fan_in_fan_out", None) + embedding_kwargs.update(lora_config.loftq_config) + new_module = Embedding(target, adapter_name, **embedding_kwargs) + elif isinstance(target_base_layer, torch.nn.Conv2d): + kwargs.update(lora_config.loftq_config) + new_module = Conv2d(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + kwargs.update(lora_config.loftq_config) + new_module = Linear(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + kwargs.update(lora_config.loftq_config) + new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/model.py b/MoRA/peft_mora/tuners/lora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..179f4a8b1b49ae3b04ba30dba000176f6ad736ba --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/model.py @@ -0,0 +1,730 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import math +import operator +import re +import warnings +from dataclasses import asdict, replace +from enum import Enum +from functools import reduce +from itertools import chain +from typing import Literal, Optional + +import torch +from torch import nn +from tqdm import tqdm + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists, onload_layer +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _freeze_adapter, + _get_submodules, + get_quantization_config, +) +from peft_mora.utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task_arithmetic, ties + +from .aqlm import dispatch_aqlm +from .awq import dispatch_awq +from .config import LoraConfig +from .gptq import dispatch_gptq +from .layer import Conv2d, LoraLayer, dispatch_default +from .tp_layer import dispatch_megatron + + +class LoraModel(BaseTuner): + """ + Creates Low Rank Adapter (LoRA) model from a pretrained transformers model. 
+ + The method is described in detail in https://arxiv.org/abs/2106.09685. + + Args: + model ([`torch.nn.Module`]): The model to be adapted. + config ([`LoraConfig`]): The configuration of the Lora model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The Lora model. + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import LoraModel, LoraConfig + + >>> config = LoraConfig( + ... task_type="SEQ_2_SEQ_LM", + ... r=8, + ... lora_alpha=32, + ... target_modules=["q", "v"], + ... lora_dropout=0.01, + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> lora_model = LoraModel(model, config, "default") + ``` + + ```py + >>> import transformers + >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_int8_training + + >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"] + >>> config = LoraConfig( + ... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" + ... ) + + >>> model = transformers.GPTJForCausalLM.from_pretrained( + ... "kakaobrain/kogpt", + ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b + ... pad_token_id=tokenizer.eos_token_id, + ... use_cache=False, + ... device_map={"": rank}, + ... torch_dtype=torch.float16, + ... load_in_8bit=True, + ... ) + >>> model = prepare_model_for_int8_training(model) + >>> lora_model = get_peft_model(model, config) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`LoraConfig`]): The configuration of the Lora model. + """ + + prefix: str = "lora_" + + def __init__(self, model, config, adapter_name) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: LoraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check + # does not fully correspond to the error message. + if (len(self.peft_config) > 1) and (config.bias != "none"): + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." 
+ ) + + @staticmethod + def _check_target_module_exists(lora_config, key): + return check_target_module_exists(lora_config, key) + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(lora_config.rank_pattern.keys(), lora_config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf".*\.{key}$", current_key), pattern_keys), current_key) + r = lora_config.rank_pattern.get(target_name_key, lora_config.r) + alpha = lora_config.alpha_pattern.get(target_name_key, lora_config.lora_alpha) + + + kwargs = { + "r": r, + "lora_alpha": alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "use_rslora": lora_config.use_rslora, + "use_dora": lora_config.use_dora, + "use_mora": lora_config.use_mora, + "mora_type": lora_config.mora_type, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + + + use_mora = lora_config.use_mora + + quant_methods = ["gptq", "aqlm", "awq"] + for quant_method in quant_methods: + quantization_config = get_quantization_config(self.model, method=quant_method) + if quantization_config is not None: + kwargs[f"{quant_method}_quantization_config"] = quantization_config + + # note: AdaLoraLayer is a subclass of LoraLayer, we need to exclude it + from peft_mora.tuners.adalora import AdaLoraLayer + + if isinstance(target, LoraLayer) and not isinstance(target, AdaLoraLayer): + target.update_layer( + adapter_name, + r, + lora_alpha=alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, + use_rslora=lora_config.use_rslora, + use_dora=lora_config.use_dora, + use_mora=use_mora, + mora_type=lora_config.mora_type, + ) + else: + new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if (self.prefix in name) or ("ranknum" in name): + weight = child.qweight if hasattr(child, "qweight") else child.weight + module.to(weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = self.peft_config[active_adapter].bias + if 
bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + for m in model.modules(): + if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(lora_config, adapter_name, target, **kwargs): + # Collect dispatcher functions to decide what backend to use for the replaced LoRA layer. The order matters, + # because the first match is always used. Therefore, the default layers should be checked last. + dispatchers = [] + + # avoid eager bnb import + if is_bnb_available(): + from .bnb import dispatch_bnb_8bit + + dispatchers.append(dispatch_bnb_8bit) + + if is_bnb_4bit_available(): + from .bnb import dispatch_bnb_4bit + + dispatchers.append(dispatch_bnb_4bit) + + dispatchers.extend([dispatch_aqlm, dispatch_awq, dispatch_gptq, dispatch_megatron, dispatch_default]) + + new_module = None + for dispatcher in dispatchers: + new_module = dispatcher(target, adapter_name, lora_config=lora_config, **kwargs) + if new_module is not None: # first match wins + break + + if new_module is None: + # no module could be matched + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`." + ) + + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled: bool = True) -> None: + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. + """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + for active_adapter in self.active_adapters: + val = self.peft_config[active_adapter].bias + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. 
+ """ + for module in self.model.modules(): + if isinstance(module, LoraLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge LORA layers when the model is gptq quantized") + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + with onload_layer(target): + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def add_weighted_adapter( + self, + adapters, + weights, + adapter_name, + combination_type="svd", + svd_rank=None, + svd_clamp=None, + svd_full_matrices=True, + svd_driver=None, + density=None, + majority_sign_method: Literal["total", "frequency"] = "total", + ) -> None: + """ + This method adds a new adapter by merging the given adapters with the given weights. + + When using the `cat` combination_type you should be aware that rank of the resulting adapter will be equal to + the sum of all adapters ranks. So it's possible that the mixed adapter may become too big and result in OOM + errors. + + Args: + adapters (`list`): + List of adapter names to be merged. + weights (`list`): + List of weights for each adapter. + adapter_name (`str`): + Name of the new adapter. + combination_type (`str`): + The merging type can be one of [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, + `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]. When using the `cat` + combination_type, the rank of the resulting adapter is equal to the sum of all adapters ranks (the + mixed adapter may be too big and result in OOM errors). + svd_rank (`int`, *optional*): + Rank of output adapter for svd. If None provided, will use max rank of merging adapters. + svd_clamp (`float`, *optional*): + A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform + clamping. Defaults to None. 
+ svd_full_matrices (`bool`, *optional*): + Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned + tensors U and Vh. Defaults to True. + svd_driver (`str`, *optional*): + Name of the cuSOLVER method to be used. This keyword argument only works when merging on CUDA. Can be + one of [None, `gesvd`, `gesvdj`, `gesvda`]. For more info please refer to `torch.linalg.svd` + documentation. Defaults to None. + density (`float`, *optional*): + Value between 0 and 1. 0 means all values are pruned and 1 means no values are pruned. Should be used + with [`ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, + `magnintude_prune`, `magnitude_prune_svd`] + majority_sign_method (`str`): + The method, should be one of ["total", "frequency"], to use to get the magnitude of the sign values. + Should be used with [`ties`, `ties_svd`, `dare_ties`, `dare_ties_svd`] + """ + + if adapter_name in list(self.peft_config.keys()): + return + for adapter in adapters: + if adapter not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter} does not exist") + + # if there is only one adapter, we can only use linear merging + combination_type = "linear" if len(adapters) == 1 else combination_type + + adapters_ranks = [self.peft_config[adapter].r for adapter in adapters] + if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"): + # all adapters ranks should be same, new rank is just this value + if len(set(adapters_ranks)) != 1: + raise ValueError( + "All adapters must have the same r value when using combination_type linear, ties, dare_ties or dare_linear." + ) + new_rank = adapters_ranks[0] + elif combination_type == "cat": + # adapters ranks may be different, new rank is sum of all ranks + # be careful, because output adapter rank may be really big if mixing a lot of adapters + new_rank = sum(adapters_ranks) + elif combination_type.endswith("svd"): + # new rank is the max of all ranks of the adapters if not provided + new_rank = svd_rank or max(adapters_ranks) + else: + raise ValueError(f"Invalid combination_type: {combination_type}") + + target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters] + if not target_module_types: + raise ValueError(f"Found no adapter matching the names in {adapters}") + if len(set(target_module_types)) > 1: + raise ValueError( + "all adapter configs should follow the same target modules type. " + "Combining adapters with `target_modules` type being a mix of list/set and string is not supported." + ) + + if target_module_types[0] == str: + new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) + elif target_module_types[0] == set: + new_target_modules = reduce( + operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters) + ) + else: + raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules") + + self.peft_config[adapter_name] = replace( + self.peft_config[adapters[0]], + r=new_rank, + lora_alpha=new_rank, + target_modules=new_target_modules, + ) + self.inject_adapter(self.model, adapter_name) + + # Do we really need that? 
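+        # _freeze_adapter sets requires_grad=False on the newly injected adapter's parameters; since this adapter is synthesized from the existing adapters' weights, it is presumably not meant to be trained further.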
+ _freeze_adapter(self.model, adapter_name) + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LoraLayer): + if adapter_name in target.lora_A: + target_lora_A = target.lora_A[adapter_name].weight + target_lora_B = target.lora_B[adapter_name].weight + elif adapter_name in target.lora_embedding_A: + target_lora_A = target.lora_embedding_A[adapter_name] + target_lora_B = target.lora_embedding_B[adapter_name] + else: + continue + + target_lora_A.data = target_lora_A.data * 0.0 + target_lora_B.data = target_lora_B.data * 0.0 + if combination_type == "cat": + loras_A, loras_B = [], [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter]) + loras_B.append(current_adapter_lora_B.data) + + if len(loras_A) == 0: + raise ValueError("No matching LoRAs found. Please raise an issue on GitHub.") + loras_A = torch.cat(loras_A, dim=0) + loras_B = torch.cat(loras_B, dim=1) + target_lora_A.data[: loras_A.shape[0], :] = loras_A + target_lora_B.data[:, : loras_B.shape[1]] = loras_B + elif combination_type in [ + "svd", + "ties_svd", + "dare_linear_svd", + "dare_ties_svd", + "magnitude_prune_svd", + ]: + target_lora_A.data, target_lora_B.data = self._svd_generalized_task_arithmetic_weighted_adapter( + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + svd_clamp, + full_matrices=svd_full_matrices, + driver=svd_driver, + ) + elif combination_type in ["linear", "ties", "dare_linear", "dare_ties", "magnitude_prune"]: + target_lora_A.data, target_lora_B.data = self._generalized_task_arithmetic_weighted_adapter( + combination_type, adapters, weights, target, density, majority_sign_method + ) + + def _svd_generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + clamp=None, + full_matrices=True, + driver=None, + ): + valid_adapters = [] + valid_weights = [] + is_embedding = any(adapter in target.lora_embedding_A for adapter in adapters) + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A or adapter in target.lora_embedding_A: + valid_adapters.append(adapter) + valid_weights.append(weight * target.scaling[adapter]) + + # if no valid adapter, nothing to do + if len(valid_adapters) == 0: + raise ValueError("No matching LoRAs found. 
Please raise an issue on Github.") + delta_weight = [target.get_delta_weight(adapter) for adapter in valid_adapters] + valid_weights = torch.tensor(valid_weights).to(delta_weight[0].device) + if combination_type == "svd": + delta_weight = task_arithmetic(delta_weight, valid_weights) + elif combination_type == "ties_svd": + delta_weight = ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear_svd": + delta_weight = dare_linear(delta_weight, valid_weights, density) + elif combination_type == "dare_ties_svd": + delta_weight = dare_ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune_svd": + delta_weight = magnitude_prune(delta_weight, valid_weights, density) + else: + raise ValueError(f"Invalid value passed to combination type: {combination_type}") + + conv2d = isinstance(target, Conv2d) + if conv2d: + conv2d_1x1 = target.weight.size()[2:4] == (1, 1) + if not conv2d_1x1: + delta_weight = delta_weight.flatten(start_dim=1) + else: + delta_weight = delta_weight.squeeze() + if (hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out) or is_embedding: + delta_weight = delta_weight.T + + # based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131 + U, S, Vh = torch.linalg.svd(delta_weight, full_matrices=full_matrices, driver=driver) + U = U[:, :new_rank] + S = S[:new_rank] + U = U @ torch.diag(S) + Vh = Vh[:new_rank, :] + if clamp is not None: + dist = torch.cat([U.flatten(), Vh.flatten()]) + hi_val = torch.quantile(dist, clamp) + low_val = -hi_val + U = U.clamp(low_val, hi_val) + Vh = Vh.clamp(low_val, hi_val) + if conv2d: + U = U.reshape(target_lora_B.data.shape) + Vh = Vh.reshape(target_lora_A.data.shape) + return Vh, U + + def _generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + target, + density, + majority_sign_method, + ): + # account weights for LoRA A and B layers. 
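+        # The square root below splits each weight (together with the adapter's scaling) evenly between the A and B factors, so that each adapter's own B @ A contribution ends up scaled by weight * scaling overall.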
+ valid_weights = [] + lora_A_deltas = [] + lora_B_deltas = [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + valid_weights.append(math.sqrt(weight * target.scaling[adapter])) + lora_A_deltas.append(current_adapter_lora_A.data) + lora_B_deltas.append(current_adapter_lora_B.data) + valid_weights = torch.tensor(valid_weights).to(lora_A_deltas[0].device) + lora_deltas = [lora_A_deltas, lora_B_deltas] + dtype = lora_A_deltas[0].dtype + for i, task_tensors in enumerate(lora_deltas): + if combination_type == "linear": + lora_deltas[i] = task_arithmetic(task_tensors, valid_weights) + elif combination_type == "ties": + lora_deltas[i] = ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear": + lora_deltas[i] = dare_linear(task_tensors, valid_weights, density) + elif combination_type == "dare_ties": + lora_deltas[i] = dare_ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune": + lora_deltas[i] = magnitude_prune(task_tensors, valid_weights, density) + else: + raise ValueError("Invalid combination type") + lora_deltas = [delta.to(dtype) for delta in lora_deltas] + return lora_deltas + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LoraLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> torch.nn.Module: + r""" + This method merges the LoRa layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. 
This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) diff --git a/MoRA/peft_mora/tuners/lora/tp_layer.py b/MoRA/peft_mora/tuners/lora/tp_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..57ba0eef2114ae1bdc6d29e385d0b166620aab91 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/tp_layer.py @@ -0,0 +1,239 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import warnings +from typing import Any, Optional + +import torch +import torch.nn as nn +import torch.nn.init as init + +from peft_mora.tuners.tuners_utils import BaseTunerLayer + +from .layer import LoraLayer + + +class LoraParallelLinear(nn.Module, LoraLayer): + """ + When the target layer parallel_linear is RowParallelLinear, in order to keep the input and output shapes + consistent, we need to split the lora matrix A into rows, and the lora_B at this time should be a complete linear + layer; In the same way, when the target layer is ColumnParallelLinear, we perform column segmentation on lora_B, + while lora_A is still a complete linear layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + backend, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer=base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self.backend = backend + self.is_parallel_a = isinstance(base_layer, backend.RowParallelLinear) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + megatron_config = kwargs["megatron_config"] + parallel_linear_kwargs = {"megatron_config": megatron_config} + init_method = init.xavier_normal_ + if hasattr(megatron_config, "init_method"): + init_method = megatron_config.init_method + input_is_parallel = True + gather_output = False + if isinstance(base_layer, self.backend.RowParallelLinear): + input_is_parallel = base_layer.input_is_parallel + else: + gather_output = base_layer.gather_output + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + init_method=init_method, + input_is_parallel=input_is_parallel, + gather_output=gather_output, + **parallel_linear_kwargs, + ) + + self.is_target_conv_1d_layer = False + + @property + def is_paralle_a(self): + # TODO: remove it in PEFT 0.10.0 + # See https://github.com/huggingface/peft/pull/1439 for more details + warnings.warn( + "`is_paralle_a` is going to be deprecated in a future release. 
Please use `is_parallel_a`", FutureWarning + ) + return self.is_parallel_a + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, + use_dora=False, + init_method=init.xavier_normal_, + input_is_parallel=True, + gather_output=False, + **parallel_linear_kwargs, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + + megatron_config = parallel_linear_kwargs["megatron_config"] + # lora needs to be forced to upgrade to 32-bit precision, otherwise it will overflow + megatron_config.params_dtype = torch.float32 + if self.is_parallel_a: + lora_a = self.backend.RowParallelLinear( + input_size=self.in_features, + output_size=r, + bias=False, + input_is_parallel=input_is_parallel, + skip_bias_add=True, + init_method=init_method, + config=megatron_config, + ) + lora_b = nn.Linear(in_features=r, out_features=self.out_features, bias=False, dtype=torch.float32) + else: + lora_a = nn.Linear(in_features=self.in_features, out_features=r, bias=False, dtype=torch.float32) + lora_b = self.backend.ColumnParallelLinear( + input_size=r, + output_size=self.out_features, + bias=False, + gather_output=gather_output, + init_method=init_method, + config=megatron_config, + ) + self.lora_A[adapter_name] = lora_a + self.lora_B[adapter_name] = lora_b + if use_rslora: + self.scaling[adapter_name] = lora_alpha / (r**0.5) + else: + self.scaling[adapter_name] = lora_alpha / r + if init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any): + previous_dtype = x.dtype + # If weight is used for matrix multiplication here, the final aggregation operation of the original + # parallel_linear layer will be missing, so we need to directly call its forward function to obtain the + # output of the original parallel_linear layer. 
+ if self.disable_adapters: + if self.merged: + self.unmerge() + result, bias = self.base_layer(x, *args, **kwargs) + elif self.merged: + result, bias = self.base_layer(x, *args, **kwargs) + else: + result, bias = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + lora_result = lora_A(dropout(x)) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_B(lora_result) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_result * scaling + + result = result + lora_result + + result = result.to(previous_dtype) + return result, bias + + +def dispatch_megatron( + target: torch.nn.Module, + adapter_name: str, + lora_config, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if lora_config.megatron_config: + megatron_core = importlib.import_module(lora_config.megatron_core) + else: + megatron_core = None + + if megatron_core and isinstance( + target_base_layer, + (megatron_core.tensor_parallel.ColumnParallelLinear, megatron_core.tensor_parallel.RowParallelLinear), + ): + megatron_kwargs = kwargs.copy() + megatron_config = lora_config.megatron_config + if isinstance(megatron_config, dict): + transformer_config_class = megatron_core.transformer.transformer_config.TransformerConfig + megatron_config = transformer_config_class(**lora_config.megatron_config) + megatron_kwargs["megatron_config"] = megatron_config + if megatron_kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `ColumnParallelLinear` " + "or `RowParallelLinear`. " + "Setting fan_in_fan_out to False." + ) + megatron_kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + new_module = LoraParallelLinear( + base_layer=target, adapter_name=adapter_name, backend=megatron_core.tensor_parallel, **megatron_kwargs + ) + + return new_module diff --git a/MoRA/peft_mora/tuners/lycoris_utils.py b/MoRA/peft_mora/tuners/lycoris_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4eaa6af8aa942d4a7fd3254db5ed1f64c1743efa --- /dev/null +++ b/MoRA/peft_mora/tuners/lycoris_utils.py @@ -0,0 +1,428 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
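The dispatching logic above (`dispatch_default`, `dispatch_megatron`, and `LoraModel._create_new_module`) follows a simple first-match-wins chain: each dispatcher inspects the (unwrapped) target layer and returns either a replacement module or `None`. A minimal sketch of that pattern; `dispatch_linear_sketch` is a hypothetical, simplified dispatcher, and the real ones also receive the `LoraConfig` plus quantization/megatron kwargs:

``` python
from typing import Callable, List, Optional

import torch


def dispatch_linear_sketch(target: torch.nn.Module, adapter_name: str, **kwargs) -> Optional[torch.nn.Module]:
    # Return a replacement module only if this dispatcher recognizes the layer type.
    if isinstance(target, torch.nn.Linear):
        return torch.nn.Identity()  # stand-in for the real LoRA-wrapped Linear
    return None


def create_new_module_sketch(
    target: torch.nn.Module,
    adapter_name: str,
    dispatchers: List[Callable[..., Optional[torch.nn.Module]]],
    **kwargs,
) -> torch.nn.Module:
    # Try each dispatcher in order; the first non-None result wins, so specialized
    # backends (bnb, gptq, aqlm, awq, megatron) must be listed before the defaults.
    for dispatcher in dispatchers:
        new_module = dispatcher(target, adapter_name, **kwargs)
        if new_module is not None:
            return new_module
    raise ValueError(f"Target module {target} is not supported.")


replaced = create_new_module_sketch(torch.nn.Linear(4, 4), "default", [dispatch_linear_sketch])
```

Listing the specialized backends before `dispatch_default` is what lets quantized or tensor-parallel layers be wrapped before the generic Linear/Embedding/Conv2d handlers get a chance.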
+from __future__ import annotations + +import warnings +from abc import abstractmethod +from dataclasses import dataclass, field +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +from tqdm import tqdm + +from peft_mora.config import PeftConfig +from peft_mora.utils import ( + ModulesToSaveWrapper, + _get_submodules, +) + +from .tuners_utils import BaseTuner, BaseTunerLayer, check_adapters_to_merge, check_target_module_exists + + +@dataclass +class LycorisConfig(PeftConfig): + r""" + A base config for LyCORIS like adapters + """ + + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" + ) + }, + ) + + +class LycorisLayer(BaseTunerLayer): + r""" + A base layer for LyCORIS like adapters + """ + + # adapter_layer_names needs to be defined on the child class + other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") + + def __init__(self, base_layer: nn.Module) -> None: + self.base_layer = base_layer + self.r = {} + self.alpha = {} + self.scaling = {} + self.rank_dropout = {} + self.module_dropout = {} + + # Tuner info + self._disable_adapters = False + self.merged_adapters = [] + + @property + @abstractmethod + def _available_adapters(self) -> set[str]: + ... + + def _init_empty_weights(self, cls, *args, **kwargs) -> None: + # A helper method that allows to initialize the layer of the given class without spending time to initialize the + # model weights. The implementation is inspired by + # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used + # directly. + # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of + # omitting important logic inside that __init__. + kwargs = kwargs.copy() + final_device = kwargs.pop("device", "cpu") + cls.__init__(self, *args, device="meta", **kwargs) + self.to_empty(device=final_device) + + @abstractmethod + def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): + ... + + # TODO: refactor LoRA to use the same approach + @abstractmethod + def _get_delta_activations(self, adapter_name: str, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + """Activations added on top of the base layer output (i.e. after the base layer forward pass)""" + + @abstractmethod + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + ... + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + if safe_merge: + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + @abstractmethod + def reset_adapter_parameters(self, adapter_name: str): + ... + + def set_scale(self, adapter, scale): + if adapter not in self._available_adapters: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + self.scaling[active_adapter] *= scale + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + if scale is None: + self.scaling[active_adapter] = self.alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + @abstractmethod + def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs): + ... + + +class LycorisTuner(BaseTuner): + r""" + A base tuner for LyCORIS like adapters + """ + + prefix: str + layers_mapping: dict[type[torch.nn.Module], type[LycorisLayer]] + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + @staticmethod + def _check_target_module_exists(config, key): + return check_target_module_exists(config, key) + + @abstractmethod + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LycorisLayer, nn.Module], + target_name, + parent, + current_key, + ): + ... 
+ + @classmethod + def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer: + # Find corresponding subtype of provided target module + new_module_cls = None + for subtype, target_cls in cls.layers_mapping.items(): + if ( + hasattr(target, "base_layer") + and isinstance(target.get_base_layer(), subtype) + and isinstance(target, BaseTunerLayer) + ): + # nested tuner layers are allowed + new_module_cls = target_cls + break + elif isinstance(target, subtype): + new_module_cls = target_cls + break + + # We didn't find corresponding type, so adapter for this layer is not supported + if new_module_cls is None: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) + raise ValueError( + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" + ) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Conv2d): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + else: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) + raise ValueError( + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" + ) + + return new_module + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + raise ValueError("Please specify `target_modules` in `peft_config`") + return peft_config + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if self.prefix in name: + module.to(child.weight.device) + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def _unload_and_optionally_merge( + self, + merge: bool = True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge LOHA layers when the model is gptq quantized") + + self._unloading_checks(adapter_names) + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if 
hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. + """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + self._set_adapter_layers(enabled=False) + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> torch.nn.Module: + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. + """ + for module in self.model.modules(): + if isinstance(module, LycorisLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (`str`): Name of the adapter to be deleted. 
+ """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LycorisLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] diff --git a/MoRA/peft_mora/tuners/mixed/__init__.py b/MoRA/peft_mora/tuners/mixed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2955d7258ddcf76b47b38fd6fd5ebeb3d1d6110c --- /dev/null +++ b/MoRA/peft_mora/tuners/mixed/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .model import COMPATIBLE_TUNER_TYPES, MixedModel + + +__all__ = ["COMPATIBLE_TUNER_TYPES", "MixedModel"] diff --git a/MoRA/peft_mora/tuners/mixed/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/mixed/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1b1ca38b414cba3d28434168d863a7b916b0e87 Binary files /dev/null and b/MoRA/peft_mora/tuners/mixed/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/mixed/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/mixed/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ac119ac5321e1412b8d8f5a89b38e2afbfc1313 Binary files /dev/null and b/MoRA/peft_mora/tuners/mixed/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/mixed/model.py b/MoRA/peft_mora/tuners/mixed/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9a660d6fe9ef2967749abefdf9508f8493d79aa6 --- /dev/null +++ b/MoRA/peft_mora/tuners/mixed/model.py @@ -0,0 +1,339 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
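The `merge_and_unload`, `unload`, and `set_adapter` methods defined on the Lycoris tuner above follow the same calling convention as their LoRA counterparts. A minimal usage sketch, assuming a LoHa/LoKr/OFT adapter has already been attached via `get_peft_model` and that the wrapper forwards these calls as in upstream peft:

``` python
# Sketch only: assumes `peft_model` wraps a Lycoris-style adapter (LoHa, LoKr, OFT).
# Fold the active adapters into the base weights, with a progress bar and NaN check:
merged_model = peft_model.merge_and_unload(progressbar=True, safe_merge=True)

# Or strip the adapter modules without merging, recovering the original base model:
base_model = peft_model.unload()

# Switch the active adapter; merged layers are unmerged first, as warned in `set_adapter`:
peft_model.set_adapter("default")
```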
+from __future__ import annotations + +import warnings +from typing import Any, Optional, Union + +from torch import nn +from tqdm import tqdm + +from peft_mora.tuners import adalora, loha, lokr, lora, oft +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + PeftType, + _get_submodules, + get_auto_gptq_quant_linear, +) + + +# Collection of constants used for all tuners +COMPATIBLE_TUNER_TYPES = (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.OFT) +PREFIXES = [lora.LoraModel.prefix, lokr.LoKrModel.prefix, loha.LoHaModel.prefix, oft.OFTModel.prefix] +Configs = Union[lora.LoraConfig, loha.LoHaConfig, lokr.LoKrConfig, adalora.AdaLoraConfig, oft.OFTConfig] +Layers = (lora.layer.LoraLayer, loha.layer.LoHaLayer, lokr.layer.LoKrLayer, adalora.layer.AdaLoraLayer, oft.OFTLayer) + + +class MixedModel(BaseTuner): + """ + A class that allows to mix different types of adapters in a single model. + + Note: This class should usually not be initialized directly. Instead, use `get_peft_model` with the argument + `mixed=True`. + + Args: + model (:obj:`nn.Module`): + The model to be tuned. + config (:obj:`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (:obj:`str`): + The name of the first adapter. + """ + + def __init__(self, model: nn.Module, config: Configs, adapter_name: str) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: Configs) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + if not isinstance(config, Configs.__args__): + raise ValueError( + f"{self.__class__.__name__} only supports {COMPATIBLE_TUNER_TYPES} configs, but got {type(config)}." + ) + + biases = (getattr(config, "bias", None) for config in self.peft_config) + biases = [bias for bias in biases if bias not in (None, "none")] + if len(biases) > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." 
+ ) + + @staticmethod + def _check_target_module_exists(config: Configs, key: str): + return check_target_module_exists(config, key) + + def _create_and_replace( + self, + config: Configs, + *args: Any, + **kwargs: Any, + ) -> None: + if isinstance(config, adalora.AdaLoraConfig): + adalora.AdaLoraModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, lora.LoraConfig): + lora.LoraModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, loha.LoHaConfig): + loha.LoHaModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, lokr.LoKrConfig): + lokr.LoKrModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, oft.OFTConfig): + oft.OFTModel._create_and_replace(self, config, *args, **kwargs) + else: + raise ValueError(f"Unsupported config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.") + + def _replace_module(self, parent, child_name, new_module, child) -> None: + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.get_base_layer() + elif hasattr(child, "quant_linear_module"): + # TODO maybe not necessary to have special treatment? + child = child.quant_linear_module + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if any(prefix in name for prefix in PREFIXES): + module.to(child.weight.device) + if "ranknum" in name: + module.to(child.weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if not any(prefix in n for prefix in PREFIXES): + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = getattr(self.peft_config[active_adapter], "bias", "none") + if bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + # TODO: check if this is needed for other supported types + for m in model.modules(): + if isinstance(m, Layers) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise ValueError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(config, adapter_name, target, **kwargs): + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + if (gptq_quantization_config is not None) or (AutoGPTQQuantLinear is not None): + raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).") + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + if loaded_in_8bit or loaded_in_4bit: + raise ValueError(f"8bit and 4bit quantization not supported for {config.peft_type.value} (yet).") + + if isinstance(config, adalora.AdaLoraConfig): + new_module = adalora.AdaLoraModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, 
lora.LoraConfig): + new_module = lora.LoraModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, loha.LoHaConfig): + new_module = loha.LoHaModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, lokr.LoKrConfig): + new_module = lokr.LoKrModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, oft.OFTConfig): + new_module = oft.OFTModel._create_new_module(config, adapter_name, target, **kwargs) + else: + raise ValueError(f"Unknown config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.") + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + for active_adapter in self.active_adapters: + val = getattr(self.peft_config[active_adapter], "bias", "none") + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: Union[str, list[str]]) -> None: + for module in self.model.modules(): + if isinstance(module, Layers): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge layers when the model is gptq quantized") + + def merge_recursively(module): + # helper function to recursively merge the base_layer of the target + path = [] + layer = module + while hasattr(layer, "base_layer"): + path.append(layer) + layer = layer.base_layer + for layer_before, layer_after in zip(path[:-1], path[1:]): + layer_after.merge(safe_merge=safe_merge, adapter_names=adapter_names) + layer_before.base_layer = layer_after.base_layer + module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + + key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)] + desc = "Unloading " + ("and merging " if merge else "") + "model" + + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + merge_recursively(target) + self._replace_module(parent, 
target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def add_weighted_adapter(self, *args: Any, **kwargs: Any) -> None: + raise NotImplementedError(f"Weighted adapters are not supported for {self.__class__.__name__} (yet).") + + def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (Union[str, list[str]]): Name of the adapter(s) to delete. + """ + if isinstance(adapter_name, str): + adapter_names = [adapter_name] + else: + adapter_names = adapter_name + + mismatched = set(adapter_names) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + for adapter_name in adapter_names: + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, BaseTunerLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> nn.Module: + r""" + This method merges the layers into the base model. This is needed if someone wants to use the base model as a + standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def generate(self, *args: Any, **kwargs: Any): + return self.model.generate(*args, **kwargs) diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__init__.py b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..214f7722486485bea4ede3b5c1a433aac447dd2b --- /dev/null +++ b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .model import MultitaskPromptEmbedding + + +__all__ = ["MultitaskPromptTuningConfig", "MultitaskPromptTuningInit", "MultitaskPromptEmbedding"] diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cef42d1dc07a447378c63f4c398ce54713af2cd8 Binary files /dev/null and b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10e711620d8d551562b91eaf69ceff945fd003eb Binary files /dev/null and b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ffef858cdcd56a0df6a779d40f7b39e2dad4f5f Binary files /dev/null and b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/config.py b/MoRA/peft_mora/tuners/multitask_prompt_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..3ccd12e2df8bc6869ae00a24cef946f8fa37b51a --- /dev/null +++ b/MoRA/peft_mora/tuners/multitask_prompt_tuning/config.py @@ -0,0 +1,61 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
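The `MixedModel` above is normally reached through `get_peft_model(..., mixed=True)`, as its docstring notes. A sketch of mixing two compatible adapter types; the `mixed=True` flag and `add_adapter` signature follow the upstream peft API, and all names here are illustrative:

``` python
from peft_mora import LoHaConfig, LoraConfig, get_peft_model

# `base_model` is assumed to be any supported transformers model.
lora_cfg = LoraConfig(r=8, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
loha_cfg = LoHaConfig(r=8, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")

mixed = get_peft_model(base_model, lora_cfg, adapter_name="lora", mixed=True)
mixed.add_adapter("loha", loha_cfg)   # second adapter of a different, compatible type
mixed.set_adapter(["lora", "loha"])   # activate both; MixedModel applies them jointly
merged = mixed.merge_and_unload()     # or mixed.unload() to drop adapters without merging
```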
+ +import enum +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft_mora.tuners.prompt_tuning import PromptTuningConfig +from peft_mora.utils import PeftType + + +class MultitaskPromptTuningInit(str, enum.Enum): + # initialize prompt with text + TEXT = "TEXT" + # initialize prompt with random matrix + RANDOM = "RANDOM" + # average the prefix and column matrices obtained during source training + AVERAGE_SOURCE_TASKS = "AVERAGE_SOURCE_TASKS" + # pick prefix and column matrices for a particular task obtained during source training + EXACT_SOURCE_TASK = "EXACT_SOURCE_TASK" + # only use the prompt embeddings trained during source training + ONLY_SOURCE_SHARED = "ONLY_SOURCE_SHARED" + + +@dataclass +class MultitaskPromptTuningConfig(PromptTuningConfig): + prompt_tuning_init: Union[MultitaskPromptTuningInit, str] = field( + default=MultitaskPromptTuningInit.RANDOM, + metadata={ + "help": ( + "How to initialize the prompt tuning parameters. Can be one of TEXT, RANDOM, AVERAGE_SOURCE_TASKS, " + "EXACT_SOURCE_TASK, ONLY_SOURCE_SHARED." + ), + }, + ) + prompt_tuning_init_state_dict_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The path of source state dict. This is required when training the downstream target prompt from " + "the pretrained source prompt" + ), + }, + ) + prompt_tuning_init_task: Optional[int] = field(default=0, metadata={"help": "source task id for initialization"}) + num_ranks: Optional[int] = field(default=1, metadata={"help": "ranks"}) + num_tasks: Optional[int] = field(default=1, metadata={"help": "number of tasks"}) + + def __post_init__(self): + self.peft_type = PeftType.MULTITASK_PROMPT_TUNING diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/model.py b/MoRA/peft_mora/tuners/multitask_prompt_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b317b94c8c07afe3a85367cf045bc2be54ace773 --- /dev/null +++ b/MoRA/peft_mora/tuners/multitask_prompt_tuning/model.py @@ -0,0 +1,115 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from peft_mora.tuners.prompt_tuning import PromptEmbedding +from peft_mora.utils import TaskType + +from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit + + +# This code is adapted for the paper: https://arxiv.org/abs/2303.02861 and +# constitutes the work done at MIT-IBM Watson Research Lab. 
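To make the `MultitaskPromptTuningConfig` options above concrete, here is a sketch of a source-stage config and a target-stage config that re-initializes from the source run; the state-dict path is a hypothetical placeholder:

``` python
from peft_mora import MultitaskPromptTuningConfig, MultitaskPromptTuningInit

# Source stage: train one shared prompt plus per-task low-rank factors across 8 tasks.
source_cfg = MultitaskPromptTuningConfig(
    task_type="SEQ_2_SEQ_LM",
    num_virtual_tokens=50,
    num_tasks=8,
    num_ranks=1,
    prompt_tuning_init=MultitaskPromptTuningInit.RANDOM,
)

# Target stage: initialize from the averaged source factors saved earlier.
target_cfg = MultitaskPromptTuningConfig(
    task_type="SEQ_2_SEQ_LM",
    num_virtual_tokens=50,
    num_tasks=1,
    prompt_tuning_init=MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS,
    prompt_tuning_init_state_dict_path="checkpoints/source_prompt.pt",  # hypothetical path
)
```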
+ + +class MultitaskPromptEmbedding(PromptEmbedding): + def __init__(self, config: MultitaskPromptTuningConfig, word_embeddings): + super().__init__(config, word_embeddings) + + self.num_tasks = config.num_tasks + self.num_ranks = config.num_ranks + self.num_virtual_tokens = config.num_virtual_tokens + + self.num_transformer_submodules = config.num_transformer_submodules + if self.num_transformer_submodules is None: + self.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 + + self.token_dim = config.token_dim + + total_virtual_tokens = self.num_virtual_tokens * self.num_transformer_submodules + + self.prefix_task_cols = torch.nn.Parameter( + torch.normal( + mean=0, + std=0.02, + size=(self.num_tasks, total_virtual_tokens, self.num_ranks), + ) + ) + self.prefix_task_rows = torch.nn.Parameter( + torch.normal( + mean=0, + std=0.02, + size=(self.num_tasks, self.num_ranks, self.token_dim), + ) + ) + + if config.prompt_tuning_init in [ + MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, + MultitaskPromptTuningInit.EXACT_SOURCE_TASK, + MultitaskPromptTuningInit.ONLY_SOURCE_SHARED, + ]: + if config.prompt_tuning_init_state_dict_path is None: + raise ValueError( + f"prompt_tuning_init_state_dict_path needs to be specified with {config.prompt_tuning_init} " + "init method" + ) + + # TODO: There should be an option for safetensors + state_dict: dict = torch.load( + config.prompt_tuning_init_state_dict_path, + map_location=word_embeddings.weight.device, + ) + + if config.prompt_tuning_init in [ + MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, + MultitaskPromptTuningInit.EXACT_SOURCE_TASK, + ]: + prefix_task_cols_: torch.Tensor = state_dict["prefix_task_cols"] + prefix_task_rows_: torch.Tensor = state_dict["prefix_task_rows"] + + if config.prompt_tuning_init == MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS: + prefix_task_cols_ = prefix_task_cols_.mean(0, keepdim=True) + prefix_task_rows_ = prefix_task_rows_.mean(0, keepdim=True) + elif config.prompt_tuning_init == MultitaskPromptTuningInit.EXACT_SOURCE_TASK: + prefix_task_cols_ = prefix_task_cols_[config.prompt_tuning_init_task, ...].unsqueeze(0) + prefix_task_rows_ = prefix_task_rows_[config.prompt_tuning_init_task, ...].unsqueeze(0) + + state_dict = { + "embedding.weight": state_dict["prompt_embeddings"], + "prefix_task_cols": prefix_task_cols_, + "prefix_task_rows": prefix_task_rows_, + } + + self.load_state_dict(state_dict, strict=True) + elif config.prompt_tuning_init == MultitaskPromptTuningInit.ONLY_SOURCE_SHARED: + state_dict = { + "embedding.weight": state_dict["prompt_embeddings"], + } + + self.load_state_dict(state_dict, strict=False) + + def forward(self, indices, task_ids): + if task_ids is None: + raise ValueError("task_ids cannot be None") + + prompt_embeddings = self.embedding(indices) + + task_cols = torch.index_select(self.prefix_task_cols, 0, task_ids) + task_rows = torch.index_select(self.prefix_task_rows, 0, task_ids) + task_prompts = torch.matmul(task_cols, task_rows) + + prompt_embeddings *= task_prompts + + return prompt_embeddings diff --git a/MoRA/peft_mora/tuners/oft/__init__.py b/MoRA/peft_mora/tuners/oft/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52ac7131e24bd5cf39bf97ab6336ed1f1d46e152 --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
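The `forward` of `MultitaskPromptEmbedding` above composes the selected per-task factors into a rank-`num_ranks` matrix and rescales the shared prompt elementwise. A shape-only sketch of that computation, with illustrative sizes:

``` python
import torch

batch, tokens, rank, dim = 2, 50, 1, 768
shared = torch.randn(batch, tokens, dim)  # shared prompt embeddings looked up from `embedding`
cols = torch.randn(batch, tokens, rank)   # prefix_task_cols rows selected by task_ids
rows = torch.randn(batch, rank, dim)      # prefix_task_rows rows selected by task_ids

task_scale = torch.matmul(cols, rows)     # (batch, tokens, dim): a low-rank per-task matrix
task_prompt = shared * task_scale         # elementwise rescaling, as in the forward above
assert task_prompt.shape == (batch, tokens, dim)
```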
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import OFTConfig +from .layer import Conv2d, Linear, OFTLayer +from .model import OFTModel + + +__all__ = ["OFTConfig", "OFTModel", "Conv2d", "Linear", "OFTLayer"] diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9908cd15f1115e8f6d388f6583e0dd421e7a4b9 Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3072f6cd987abe6bcfd9190552e2a74067daede9 Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8864ed7cdb39cd89a6140196bc359a03775f8ef Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0348b67b0f06e2d7257140ca2fab33c4333490ae Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/config.py b/MoRA/peft_mora/tuners/oft/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a6ed2ba8f04a57b878d17ee482e454d23617b2d --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/config.py @@ -0,0 +1,119 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.tuners.lycoris_utils import LycorisConfig +from peft_mora.utils import PeftType + + +@dataclass +class OFTConfig(LycorisConfig): + """ + This is the configuration class to store the configuration of a [`OFTModel`]. + + Args: + r (`int`): OFT rank. + module_dropout (`int`): The dropout probability for disabling OFT modules during training. + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. 
When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding + the output layer. If this is not specified, modules will be chosen according to the model architecture. If + the architecture is not known, an error will be raised -- in this case, you should specify the target + modules manually. + init_weights (`bool`): + Whether to perform initialization of OFT weights. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + coft (`bool`): + Whether to use the constrained variant of OFT or not, off by default. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): + Whether to share the OFT parameters between blocks or not. This is `False` by default. + """ + + r: int = field(default=8, metadata={"help": "OFT rank"}) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"} + ) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with OFT." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the OFT layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 
+ }, + ) + coft: bool = field( + default=False, + metadata={"help": "Whether to use the constrained variant of OFT or not."}, + ) + eps: float = field( + default=6e-5, + metadata={ + "help": "The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True." + }, + ) + block_share: bool = field( + default=False, + metadata={"help": "Whether to share the OFT parameters between blocks or not."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.OFT + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/MoRA/peft_mora/tuners/oft/layer.py b/MoRA/peft_mora/tuners/oft/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed3df15bd94eaadf0518497a1e1622b7aec86b5 --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/layer.py @@ -0,0 +1,388 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from typing import Any, List, Optional, Set, Tuple + +import torch +import torch.nn as nn + +from peft_mora.tuners.lycoris_utils import LycorisLayer, check_adapters_to_merge + + +class OFTLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("oft_r",) + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # OFT info + self.oft_r = nn.ParameterDict({}) + self.coft = {} + self.eps = {} + self.block_share = {} + + @property + def _available_adapters(self) -> Set[str]: + return {*self.oft_r} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...], block_share: bool): + if block_share: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(1, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + else: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(r, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + + def reset_adapter_parameters(self, adapter_name: str): + nn.init.zeros_(self.oft_r[adapter_name]) + + def reset_adapter_parameters_random(self, adapter_name: str): + nn.init.kaiming_uniform_(self.oft_r[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + module_dropout: float, + init_weights: bool, + coft: bool = False, + eps: float = 6e-5, + block_share: bool = False, + **kwargs, + ) -> None: + """Internal function to create oft adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize weights. + coft (`bool`): Whether to use the constrained variant of OFT or not. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): Whether to share the OFT parameters between blocks or not. 
+ """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.module_dropout[adapter_name] = module_dropout + self.coft[adapter_name] = coft + self.block_share[adapter_name] = block_share + + # Determine shape of OFT weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) + else: + raise TypeError(f"OFT is not implemented for base layers of type {type(base_layer).__name__}") + + self.eps[adapter_name] = eps * math.ceil(shape[0] / r) * math.ceil(shape[0] / r) + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, block_share) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def unscale_layer(self, scale=None) -> None: + # scale is not used + pass + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + + orig_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = orig_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + orig_weights = torch.transpose(orig_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if orig_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: orig_weights.shape[1], : orig_weights.shape[1]] + new_weights = torch.mm(orig_weights, delta_weight) + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = torch.transpose(new_weights, 0, 1) + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + + if safe_merge and not torch.isfinite(new_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = new_weights + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + new_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + new_weights = torch.transpose(new_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if new_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: new_weights.shape[1], : new_weights.shape[1]] + delta_inv = torch.inverse(delta_weight) + orig_weights = torch.mm(new_weights, delta_inv) + + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = torch.transpose(orig_weights, 0, 1) + orig_weights = orig_weights.reshape( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + base_layer.weight.data = orig_weights + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + rank = self.r[adapter_name] + coft = self.coft[adapter_name] + eps = self.eps[adapter_name] + opt_r = self.oft_r[adapter_name] + + if coft: + with torch.no_grad(): + opt_r.copy_(self._project_batch(opt_r, eps=eps)) + + orth_rotate = self._cayley_batch(opt_r) + weight = self._block_diagonal(orth_rotate, rank) + + return weight + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L144 + def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor: + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.transpose(1, 2)) + I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) # noqa: E741 + + # Perform the Cayley parametrization + Q = torch.bmm(I - skew, torch.inverse(I + skew)) + + return Q + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155 + def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor: + if oft_r.shape[0] == 1: + # block share + blocks = [oft_r[0, ...] for i in range(rank)] + else: + blocks = [oft_r[i, ...] 
for i in range(rank)] + + # Use torch.block_diag to create the block diagonal matrix + A = torch.block_diag(*blocks) + + return A + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52 + def _project_batch(self, oft_r, eps=1e-5): + # scaling factor for each of the smaller block matrix + eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0])) + I = ( # noqa: E741 + torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype) + .unsqueeze(0) + .expand_as(oft_r) + ) + diff = oft_r - I + norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True) + mask = (norm_diff <= eps).bool() + out = torch.where(mask, oft_r, I + eps * (diff / norm_diff)) + return out + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + if len(result.shape) == 4: + result = result.permute(0, 2, 3, 1) + + base_layer = self.get_base_layer() + base_bias = base_layer.bias + if base_bias is not None: + # Bias should be added after OFT forward + result = result - base_bias.data + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = self._get_delta_activations(active_adapter, result, *args, **kwargs) + + if base_bias is not None: + result = result + base_bias.data + if len(result.shape) == 4: + result = result.permute(0, 3, 1, 2) + + result = result.to(previous_dtype) + return result + + +class Linear(OFTLayer): + """OFT implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." 
+ rep + + +class Conv2d(OFTLayer): + """OFT implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." + rep diff --git a/MoRA/peft_mora/tuners/oft/model.py b/MoRA/peft_mora/tuners/oft/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c27dd7a73a0d05202763689fa1e24a4e37f42047 --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/model.py @@ -0,0 +1,106 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft_mora.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, OFTLayer + + +class OFTModel(LycorisTuner): + """ + Creates Orthogonal Finetuning model from a pretrained model. The method is described in + https://arxiv.org/abs/2306.07280 + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`OFTConfig`]): The configuration of the OFT model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The OFT model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import OFTModel, OFTConfig + + >>> config_te = OFTConfig( + ... r=8, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = OFTConfig( + ... r=8, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = OFTModel(model.text_encoder, config_te, "default") + >>> model.unet = OFTModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`OFTConfig`]): The configuration of the OFT model. 
+ """ + + prefix: str = "oft_" + layers_mapping: Dict[Type[torch.nn.Module], Type[OFTLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[OFTLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(config.rank_pattern.keys()) + target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + + if isinstance(target, OFTLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/MoRA/peft_mora/tuners/p_tuning/__init__.py b/MoRA/peft_mora/tuners/p_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dd3a6ba3e4442354302c5bfe3da75f1d6f69d02 --- /dev/null +++ b/MoRA/peft_mora/tuners/p_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import PromptEncoderConfig, PromptEncoderReparameterizationType +from .model import PromptEncoder + + +__all__ = ["PromptEncoder", "PromptEncoderConfig", "PromptEncoderReparameterizationType"] diff --git a/MoRA/peft_mora/tuners/p_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/p_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76895b4f965d5be0a8c27fcf19250d1bc5dd5a3e Binary files /dev/null and b/MoRA/peft_mora/tuners/p_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/p_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/p_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70fa7ea53e8aa01cc82ebda6cc5e40fd599385d2 Binary files /dev/null and b/MoRA/peft_mora/tuners/p_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/p_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/p_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b7d0f1ff1db4ee9218a1f39d510fbfd9524b0e7 Binary files /dev/null and b/MoRA/peft_mora/tuners/p_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/p_tuning/config.py b/MoRA/peft_mora/tuners/p_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d6288c1b5e48f747489c43fb1e4067a625facdba --- /dev/null +++ b/MoRA/peft_mora/tuners/p_tuning/config.py @@ -0,0 +1,59 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
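For reference, the orthogonal rotation that `get_delta_weight` builds in the OFT layer code above comes from a per-block Cayley transform (`_cayley_batch`) followed by block-diagonal assembly (`_block_diagonal`). A self-contained numeric sketch; block count and size are illustrative:

``` python
import torch

r, b = 4, 3                    # r diagonal blocks of size b x b (illustrative)
oft_r = torch.randn(r, b, b)   # unconstrained per-block parameters

skew = 0.5 * (oft_r - oft_r.transpose(1, 2))      # skew-symmetric part, as in _cayley_batch
I = torch.eye(b).unsqueeze(0).expand(r, b, b)
Q = torch.bmm(I - skew, torch.inverse(I + skew))  # Cayley transform: each block is orthogonal
R = torch.block_diag(*Q)                          # (r*b, r*b) rotation, as in _block_diagonal

# Orthogonal up to float error: R.T @ R is (approximately) the identity.
assert torch.allclose(R.T @ R, torch.eye(r * b), atol=1e-4)
```

As the merge code above shows, this block-diagonal matrix is multiplied into the (transposed) base weight, so the OFT update acts as a rotation of the output dimension rather than an additive delta.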
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +from dataclasses import dataclass, field +from typing import Union + +from peft_mora.config import PromptLearningConfig +from peft_mora.utils import PeftType + + +class PromptEncoderReparameterizationType(str, enum.Enum): + MLP = "MLP" + LSTM = "LSTM" + + +@dataclass +class PromptEncoderConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PromptEncoder`]. + + Args: + encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]): + The type of reparameterization to use. + encoder_hidden_size (`int`): The hidden size of the prompt encoder. + encoder_num_layers (`int`): The number of layers of the prompt encoder. + encoder_dropout (`float`): The dropout probability of the prompt encoder. + """ + + encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field( + default=PromptEncoderReparameterizationType.MLP, + metadata={"help": "How to reparameterize the prompt encoder"}, + ) + encoder_hidden_size: int = field( + default=None, + metadata={"help": "The hidden size of the prompt encoder"}, + ) + encoder_num_layers: int = field( + default=2, + metadata={"help": "The number of layers of the prompt encoder"}, + ) + encoder_dropout: float = field( + default=0.0, + metadata={"help": "The dropout of the prompt encoder"}, + ) + + def __post_init__(self): + self.peft_type = PeftType.P_TUNING diff --git a/MoRA/peft_mora/tuners/p_tuning/model.py b/MoRA/peft_mora/tuners/p_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ade2b1128158376c134441687803b85d444cfb96 --- /dev/null +++ b/MoRA/peft_mora/tuners/p_tuning/model.py @@ -0,0 +1,130 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Based on https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/prompt_encoder.py +# with some refactor +import warnings + +import torch + +from .config import PromptEncoderConfig, PromptEncoderReparameterizationType + + +class PromptEncoder(torch.nn.Module): + """ + The prompt encoder network that is used to generate the virtual token embeddings for p-tuning. + + Args: + config ([`PromptEncoderConfig`]): The configuration of the prompt encoder. + + Example: + + ```py + >>> from peft import PromptEncoder, PromptEncoderConfig + + >>> config = PromptEncoderConfig( + ... peft_type="P_TUNING", + ... task_type="SEQ_2_SEQ_LM", + ... 
num_virtual_tokens=20, + ... token_dim=768, + ... num_transformer_submodules=1, + ... num_attention_heads=12, + ... num_layers=12, + ... encoder_reparameterization_type="MLP", + ... encoder_hidden_size=768, + ... ) + + >>> prompt_encoder = PromptEncoder(config) + ``` + + **Attributes**: + - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt encoder. + - **mlp_head** (`torch.nn.Sequential`) -- The MLP head of the prompt encoder if `inference_mode=False`. + - **lstm_head** (`torch.nn.LSTM`) -- The LSTM head of the prompt encoder if `inference_mode=False` and + `encoder_reparameterization_type="LSTM"`. + - **token_dim** (`int`) -- The hidden embedding dimension of the base transformer model. + - **input_size** (`int`) -- The input size of the prompt encoder. + - **output_size** (`int`) -- The output size of the prompt encoder. + - **hidden_size** (`int`) -- The hidden size of the prompt encoder. + - **total_virtual_tokens** (`int`): The total number of virtual tokens of the + prompt encoder. + - **encoder_type** (Union[[`PromptEncoderReparameterizationType`], `str`]): The encoder type of the prompt + encoder. + + + Input shape: (`batch_size`, `total_virtual_tokens`) + + Output shape: (`batch_size`, `total_virtual_tokens`, `token_dim`) + """ + + def __init__(self, config): + super().__init__() + self.token_dim = config.token_dim + self.input_size = self.token_dim + self.output_size = self.token_dim + self.hidden_size = config.encoder_hidden_size + self.total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules + self.encoder_type = config.encoder_reparameterization_type + + # embedding + self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim) + if not config.inference_mode: + if self.encoder_type == PromptEncoderReparameterizationType.LSTM: + lstm_dropout = config.encoder_dropout + num_layers = config.encoder_num_layers + # LSTM + self.lstm_head = torch.nn.LSTM( + input_size=self.input_size, + hidden_size=self.hidden_size, + num_layers=num_layers, + dropout=lstm_dropout, + bidirectional=True, + batch_first=True, + ) + + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(self.hidden_size * 2, self.hidden_size * 2), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size * 2, self.output_size), + ) + + elif self.encoder_type == PromptEncoderReparameterizationType.MLP: + encoder_num_layers_default = PromptEncoderConfig.encoder_num_layers + if config.encoder_num_layers != encoder_num_layers_default: + warnings.warn( + f"for {self.encoder_type.value}, the argument `encoder_num_layers` is ignored. " + f"Exactly {encoder_num_layers_default} MLP layers are used." + ) + layers = [ + torch.nn.Linear(self.input_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.output_size), + ] + self.mlp_head = torch.nn.Sequential(*layers) + + else: + raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.") + + def forward(self, indices): + input_embeds = self.embedding(indices) + if self.encoder_type == PromptEncoderReparameterizationType.LSTM: + output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0]) + elif self.encoder_type == PromptEncoderReparameterizationType.MLP: + output_embeds = self.mlp_head(input_embeds) + else: + raise ValueError("Prompt encoder type not recognized. 
Please use one of MLP (recommended) or LSTM.") + + return output_embeds diff --git a/MoRA/peft_mora/tuners/poly/__init__.py b/MoRA/peft_mora/tuners/poly/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f368695edbd7fb7bb3c68d9e918bd16752b873 --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import PolyConfig +from .layer import Linear, PolyLayer +from .model import PolyModel + + +__all__ = ["Linear", "PolyConfig", "PolyLayer", "PolyModel"] diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e4e9c210ea53057952e5cae8141e970bb075e1a Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36a3b7469e7097f2cc60c1e2824934cbfa8f7511 Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d09270ae7aa7f995cd36a937177701dcd5074ffa Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f89a02e5d2513378911820781a29b0d1bb590b91 Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/router.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/router.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41f0dcb83bae629efb3ffed07de2fb90307b652b Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/router.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/config.py b/MoRA/peft_mora/tuners/poly/config.py new file mode 100644 index 0000000000000000000000000000000000000000..0cdac159f790fce6d6dd8e373e3f277ea01882de --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/config.py @@ -0,0 +1,89 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Literal, Optional, Union + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + + +@dataclass +class PolyConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`PolyModel`]. + - [Polytropon (Poly)](https://arxiv.org/abs/2202.13914) + - [Multi-Head Routing (MHR)](https://arxiv.org/abs/2211.03831) + + Args: + r (`int`): Attention dimension of each Lora in Poly. + target_modules (`Union[List[str],str]`): The names of the modules to apply Poly to. + modules_to_save (`List[str]`): List of modules apart from Poly layers to be set as trainable + and saved in the final checkpoint. + init_weights (bool): Whether to perform initialization of Poly weights. + poly_type (`Literal["poly"]`): The variant of the Poly module to use. Currently, only "poly" + is supported. + n_tasks (`int`): The number of tasks in a multitasking scenario. + n_skills (`int`): The number of skills (LoRA) in each Poly layer. + n_splits (`int`): The number of splits within each LoRA of a Poly layer. A value greater + than 1 indicates the use of Multi-Head Routing (MHR). + """ + + r: int = field(default=8, metadata={"help": "Lora attention dimension"}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with Poly." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from Poly layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the Poly layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + poly_type: Literal["poly"] = field( + default="poly", + metadata={"help": 'Type of Poly modules to be used. 
Currently only "poly" is supported.'}, + ) + n_tasks: int = field( + default=1, + metadata={"help": "Number of tasks in multitasking scenario."}, + ) + n_skills: int = field( + default=4, + metadata={"help": "Number of skills (LoRA) in each Poly layer."}, + ) + n_splits: int = field( + default=1, + metadata={"help": "Number of splits within each LoRA of a Poly layer."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.POLY + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/MoRA/peft_mora/tuners/poly/layer.py b/MoRA/peft_mora/tuners/poly/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..0455c8730f6889be92da7f8b74221d2ca5b8a73e --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/layer.py @@ -0,0 +1,171 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Any + +import torch +import torch.nn as nn + +from peft_mora.tuners.tuners_utils import BaseTunerLayer + +from .config import PolyConfig +from .router import get_router + + +class PolyLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("poly_lora_A", "poly_lora_B", "poly_router") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "n_tasks", "n_skills", "n_splits") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.n_tasks = {} + self.n_skills = {} + self.n_splits = {} + self.poly_type = {} + self.poly_router = nn.ModuleDict() + self.poly_lora_A = nn.ParameterDict() + self.poly_lora_B = nn.ParameterDict() + self.kwargs = kwargs + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + self.in_features = in_features + self.out_features = out_features + + def update_layer(self, adapter_name, poly_config): + if poly_config.r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {poly_config.r}") + + self.r[adapter_name] = poly_config.r + self.n_tasks[adapter_name] = poly_config.n_tasks + self.n_skills[adapter_name] = poly_config.n_skills + self.n_splits[adapter_name] = poly_config.n_splits + self.poly_type[adapter_name] = poly_config.poly_type + + self.poly_lora_A[adapter_name] = nn.Parameter( + torch.empty( + poly_config.n_splits, + poly_config.n_skills, + self.in_features // poly_config.n_splits, + poly_config.r, + ) + ) + self.poly_lora_B[adapter_name] = nn.Parameter( + torch.empty( + poly_config.n_splits, + poly_config.n_skills, + poly_config.r, + self.out_features // poly_config.n_splits, + ) + ) + self.poly_router[adapter_name] = get_router(poly_config) + + self.reset_poly_parameters(adapter_name, init_weights=poly_config.init_weights) + + weight = 
getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def reset_poly_parameters(self, adapter_name, init_weights): + if adapter_name in self.poly_lora_A.keys(): + # initialize A the same way as the default for nn.Linear + # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L269 + n_splits, n_skills, d, r = self.poly_lora_A[adapter_name].shape + for skill in range(n_skills): + for split in range(n_splits): + param = torch.empty((r, d)) + torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5)) + self.poly_lora_A[adapter_name].data[split, skill, :, :] = param.T + + if init_weights: + # initialize B to zero + torch.nn.init.zeros_(self.poly_lora_B[adapter_name]) + else: + # initialize B the same way as the default for nn.Linear + n_splits, n_skills, r, d = self.poly_lora_B[adapter_name].shape + for skill in range(n_skills): + for split in range(n_splits): + param = torch.empty((d, r)) + torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5)) + self.poly_lora_B[adapter_name].data[split, skill, :, :] = param.T + + # initialized router + self.poly_router[adapter_name].reset() + + +class Linear(nn.Module, PolyLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + poly_config: PolyConfig, + **kwargs, + ) -> None: + super().__init__() + PolyLayer.__init__(self, base_layer, **kwargs) + + self._active_adapter = adapter_name + self.update_layer(adapter_name, poly_config) + + def forward(self, x: torch.Tensor, *args: Any, task_ids: torch.Tensor = None, **kwargs: Any) -> torch.Tensor: + previous_dtype = x.dtype + if self.disable_adapters: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.poly_lora_A.keys(): + continue + + r = self.r[active_adapter] + poly_router = self.poly_router[active_adapter] + poly_lora_A = self.poly_lora_A[active_adapter] + poly_lora_B = self.poly_lora_B[active_adapter] + + # Combine the output of LoRAs + # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L293 + mixing_weights = poly_router(task_ids=task_ids, input_ids=x) + bs, n_splits, n_skills = mixing_weights.size() + + # A is n_splits, n_skills, D // n_splits, rank + # we want bs, n_splits, D // n_splits, rank + A = torch.einsum("bqs,qsdr->bqdr", (mixing_weights, poly_lora_A)) + B = torch.einsum("bqs,qsrd->bqrd", (mixing_weights, poly_lora_B)) + + A = A.reshape(bs, self.in_features, r) + B = B.transpose(1, 2).reshape(bs, r, self.out_features) + + x = x.to(A.dtype) + result += x.bmm(A).bmm(B) / r + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "poly." + rep diff --git a/MoRA/peft_mora/tuners/poly/model.py b/MoRA/peft_mora/tuners/poly/model.py new file mode 100644 index 0000000000000000000000000000000000000000..0a00e98a44623939cbb36a2e4269ceef79a622b2 --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/model.py @@ -0,0 +1,187 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from contextlib import contextmanager +from dataclasses import asdict +from enum import Enum +from typing import Any + +import torch +from torch import nn + +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, +) + +from .config import PolyConfig +from .layer import Linear, PolyLayer + + +class PolyModel(BaseTuner): + prefix: str = "poly_" + + def __init__(self, model, config, adapter_name) -> None: + super().__init__(model, config, adapter_name) + + @staticmethod + def _check_target_module_exists(poly_config, key): + return check_target_module_exists(poly_config, key) + + def _create_and_replace( + self, + poly_config: PolyConfig, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + **optional_kwargs: Any, + ): + if isinstance(target, PolyLayer): + target.update_layer(adapter_name, poly_config) + else: + new_module = self._create_new_module( + poly_config, + adapter_name, + target, + ) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if (self.prefix in name) or ("ranknum" in name): + weight = child.qweight if hasattr(child, "qweight") else child.weight + module.to(weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + @staticmethod + def _create_new_module(poly_config, adapter_name, target, **kwargs): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + return Linear(target, adapter_name, poly_config, **kwargs) + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`." 
+ ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (PolyLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name): + for module in self.model.modules(): + if isinstance(module, PolyLayer): + module.set_adapter(adapter_name) + + def _prepare_adapter_config(self, peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _register_pre_hooks(self, task_ids): + """Helper method to register pre hooks.""" + if task_ids is None: + return [] + + def pre_hook(_, args, kwargs): + kwargs["task_ids"] = task_ids + return args, kwargs + + handles = [] + + for module in self.model.modules(): + if isinstance(module, Linear): + handle = module.register_forward_pre_hook(pre_hook, with_kwargs=True) + handles.append(handle) + + return handles + + @contextmanager + def _manage_pre_hooks(self, task_ids): + """Context manager to handle the lifecycle of pre hooks.""" + handles = self._register_pre_hooks(task_ids) + try: + yield + finally: + for handle in handles: + handle.remove() + + def forward(self, *args, task_ids=None, **kwargs): + with self._manage_pre_hooks(task_ids): + return self.model(*args, **kwargs) + + def generate(self, *args, task_ids=None, **kwargs): + with self._manage_pre_hooks(task_ids): + return self.model.generate(*args, **kwargs) diff --git a/MoRA/peft_mora/tuners/poly/router.py b/MoRA/peft_mora/tuners/poly/router.py new file mode 100644 index 0000000000000000000000000000000000000000..0249398a9fc36d53bc0b4f022a8410514688a9f1 --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/router.py @@ -0,0 +1,83 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
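+
+# Usage sketch (illustrative only, kept as a comment; it is not executed on import). It shows how the
+# router defined below turns a batch of task ids into normalized mixing weights over the skills
+# (LoRA modules) of each split; the concrete numbers are arbitrary placeholders.
+#
+#     import torch
+#     from peft_mora.tuners.poly import PolyConfig
+#     from peft_mora.tuners.poly.router import get_router
+#
+#     config = PolyConfig(n_tasks=4, n_skills=8, n_splits=2)
+#     router = get_router(config)        # -> PolyRouter, since poly_type defaults to "poly"
+#     router.reset()                     # module_logits starts uninitialized; reset() fills it
+#     task_ids = torch.tensor([0, 3])    # a batch of two examples from tasks 0 and 3
+#     weights = router(task_ids=task_ids, input_ids=None)
+#     # weights has shape (2, n_splits, n_skills); each (example, split) row sums to ~1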
+ +from abc import ABC, abstractmethod + +import torch +from torch import nn +from torch.distributions.relaxed_bernoulli import RelaxedBernoulli + +from .config import PolyConfig + + +EPS = 1e-12 + + +def get_router(poly_config: PolyConfig) -> nn.Module: + if poly_config.poly_type == "poly": + return PolyRouter(poly_config) + else: + raise ValueError( + f"Unsupported poly_type: {poly_config.poly_type}. " + "Currently, only the following types are supported: " + "`poly`." + ) + + +class Router(nn.Module, ABC): + @abstractmethod + def reset(self): + ... + + @abstractmethod + def forward(self, task_ids: torch.Tensor, input_ids: torch.Tensor): + ... + + +class PolyRouter(Router): + # It's a simplified implementation of + # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L138 + def __init__(self, poly_config: PolyConfig): + super().__init__() + + self.poly_type = poly_config.poly_type + self.n_tasks = poly_config.n_tasks + self.n_skills = poly_config.n_skills + self.n_splits = poly_config.n_splits + + self.module_logits = nn.Parameter(torch.empty((self.n_tasks, self.n_splits * self.n_skills))) + + def reset(self): + torch.nn.init.uniform_(self.module_logits, -1e-3, 1e-3) + + def forward(self, task_ids: torch.Tensor, input_ids: torch.Tensor): + if task_ids is None: + raise ValueError("task_ids should not be None.") + if task_ids.max().item() >= self.n_tasks: + raise ValueError(f"Only {self.n_tasks} tasks available. Found task id = {task_ids.max().item()}") + + # move task id to input's device + task_ids = task_ids.to(self.module_logits.device) + + module_logits = self.module_logits[task_ids] + module_logits = module_logits.view(-1, self.n_splits, self.n_skills) + + if self.training: + module_logits = RelaxedBernoulli(temperature=1.0, logits=module_logits).rsample() + else: + module_logits = torch.sigmoid(module_logits) + + module_weights = module_logits / (module_logits.sum(dim=-1, keepdim=True) + EPS) + + return module_weights diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__init__.py b/MoRA/peft_mora/tuners/prefix_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28f4bedbb43bcf2b22146d60e0e1f2fe7b19d9eb --- /dev/null +++ b/MoRA/peft_mora/tuners/prefix_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
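+
+# Usage sketch (illustrative comment only; "t5-base" is a placeholder checkpoint and the imports
+# mirror the `from peft import ...` style used in the docstrings of this package):
+#
+#     from transformers import AutoModelForSeq2SeqLM
+#     from peft import PrefixTuningConfig, get_peft_model
+#
+#     model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+#     config = PrefixTuningConfig(task_type="SEQ_2_SEQ_LM", num_virtual_tokens=20)
+#     model = get_peft_model(model, config)
+#     # only the prefix encoder is trainable; it supplies 20 virtual tokens as per-layer
+#     # key/value prefixes at every forward pass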
+ +from .config import PrefixTuningConfig +from .model import PrefixEncoder + + +__all__ = ["PrefixTuningConfig", "PrefixEncoder"] diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60cf0f0394986f800330ad85d41614acc5b46346 Binary files /dev/null and b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fc20711f87961d6e34cf655396f247d65364b03 Binary files /dev/null and b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..314c191115a1a8f766e220fc2e924c4922b5eb31 Binary files /dev/null and b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prefix_tuning/config.py b/MoRA/peft_mora/tuners/prefix_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..24ea1ebcb5db96177d39c2277c77f1e839df4703 --- /dev/null +++ b/MoRA/peft_mora/tuners/prefix_tuning/config.py @@ -0,0 +1,41 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from peft_mora.config import PromptLearningConfig +from peft_mora.utils import PeftType + + +@dataclass +class PrefixTuningConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PrefixEncoder`]. + + Args: + encoder_hidden_size (`int`): The hidden size of the prompt encoder. + prefix_projection (`bool`): Whether to project the prefix embeddings. + """ + + encoder_hidden_size: int = field( + default=None, + metadata={"help": "The hidden size of the encoder"}, + ) + prefix_projection: bool = field( + default=False, + metadata={"help": "Whether to project the prefix tokens"}, + ) + + def __post_init__(self): + self.peft_type = PeftType.PREFIX_TUNING diff --git a/MoRA/peft_mora/tuners/prefix_tuning/model.py b/MoRA/peft_mora/tuners/prefix_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd51892a3cc074406791f6bc7d1b088d25148e3 --- /dev/null +++ b/MoRA/peft_mora/tuners/prefix_tuning/model.py @@ -0,0 +1,80 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Based on https://github.com/THUDM/P-tuning-v2/blob/main/model/prefix_encoder.py +# with some refactor +import torch + + +class PrefixEncoder(torch.nn.Module): + r""" + The `torch.nn` model to encode the prefix. + + Args: + config ([`PrefixTuningConfig`]): The configuration of the prefix encoder. + + Example: + + ```py + >>> from peft import PrefixEncoder, PrefixTuningConfig + + >>> config = PrefixTuningConfig( + ... peft_type="PREFIX_TUNING", + ... task_type="SEQ_2_SEQ_LM", + ... num_virtual_tokens=20, + ... token_dim=768, + ... num_transformer_submodules=1, + ... num_attention_heads=12, + ... num_layers=12, + ... encoder_hidden_size=768, + ... ) + >>> prefix_encoder = PrefixEncoder(config) + ``` + + **Attributes**: + - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prefix encoder. + - **transform** (`torch.nn.Sequential`) -- The two-layer MLP to transform the prefix embeddings if + `prefix_projection` is `True`. + - **prefix_projection** (`bool`) -- Whether to project the prefix embeddings. + + Input shape: (`batch_size`, `num_virtual_tokens`) + + Output shape: (`batch_size`, `num_virtual_tokens`, `2*layers*hidden`) + """ + + def __init__(self, config): + super().__init__() + self.prefix_projection = config.prefix_projection + token_dim = config.token_dim + num_layers = config.num_layers + encoder_hidden_size = config.encoder_hidden_size + num_virtual_tokens = config.num_virtual_tokens + if self.prefix_projection and not config.inference_mode: + # Use a two-layer MLP to encode the prefix + self.embedding = torch.nn.Embedding(num_virtual_tokens, token_dim) + self.transform = torch.nn.Sequential( + torch.nn.Linear(token_dim, encoder_hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(encoder_hidden_size, num_layers * 2 * token_dim), + ) + else: + self.embedding = torch.nn.Embedding(num_virtual_tokens, num_layers * 2 * token_dim) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.transform(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__init__.py b/MoRA/peft_mora/tuners/prompt_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71795b61d819573ff41770e6d49c750e6c51b0ae --- /dev/null +++ b/MoRA/peft_mora/tuners/prompt_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
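+
+# Usage sketch (illustrative comment only; the checkpoint name and the initialization text are
+# placeholders, and the imports mirror the `from peft import ...` style used in this package's
+# docstrings):
+#
+#     from transformers import AutoModelForCausalLM
+#     from peft import PromptTuningConfig, PromptTuningInit, get_peft_model
+#
+#     model = AutoModelForCausalLM.from_pretrained("gpt2")
+#     config = PromptTuningConfig(
+#         task_type="CAUSAL_LM",
+#         num_virtual_tokens=8,
+#         prompt_tuning_init=PromptTuningInit.TEXT,
+#         prompt_tuning_init_text="Classify the sentiment of this review:",
+#         tokenizer_name_or_path="gpt2",
+#     )
+#     model = get_peft_model(model, config)
+#     # only the 8 virtual-token embeddings are trainable; with TEXT init they start from the
+#     # word embeddings of the tokenized init text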
+ +from .config import PromptTuningConfig, PromptTuningInit +from .model import PromptEmbedding + + +__all__ = ["PromptTuningConfig", "PromptEmbedding", "PromptTuningInit"] diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70ef65eb7be3dc096910e28bd2c39f3919aacc4e Binary files /dev/null and b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..388b3710f0c9b819ddcfbbcf0651026e0e980dfb Binary files /dev/null and b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe6c76031de3e9b2706dee5fb2891775bdf901c6 Binary files /dev/null and b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prompt_tuning/config.py b/MoRA/peft_mora/tuners/prompt_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cf5e2303ac4ecfb9bf2befbc0d21b00a18bea663 --- /dev/null +++ b/MoRA/peft_mora/tuners/prompt_tuning/config.py @@ -0,0 +1,77 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft_mora.config import PromptLearningConfig +from peft_mora.utils import PeftType + + +class PromptTuningInit(str, enum.Enum): + TEXT = "TEXT" + RANDOM = "RANDOM" + + +@dataclass +class PromptTuningConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PromptEmbedding`]. + + Args: + prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding. + prompt_tuning_init_text (`str`, *optional*): + The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. + tokenizer_name_or_path (`str`, *optional*): + The name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`. + tokenizer_kwargs (`dict`, *optional*): + The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if `prompt_tuning_init` is + `TEXT`. + """ + + prompt_tuning_init: Union[PromptTuningInit, str] = field( + default=PromptTuningInit.RANDOM, + metadata={"help": "How to initialize the prompt tuning parameters"}, + ) + prompt_tuning_init_text: Optional[str] = field( + default=None, + metadata={ + "help": "The text to use for prompt tuning initialization. 
Only used if prompt_tuning_init is `TEXT`" + }, + ) + tokenizer_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`" + }, + ) + + tokenizer_kwargs: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if prompt_tuning_init is " + "`TEXT`" + ), + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.PROMPT_TUNING + + if self.tokenizer_kwargs and (self.prompt_tuning_init != PromptTuningInit.TEXT): + raise ValueError( + f"tokenizer_kwargs only valid when using prompt_tuning_init='{PromptTuningInit.TEXT.value}'." + ) diff --git a/MoRA/peft_mora/tuners/prompt_tuning/model.py b/MoRA/peft_mora/tuners/prompt_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a04221c2abfd1fb806df2805a7a28e4e3073a32d --- /dev/null +++ b/MoRA/peft_mora/tuners/prompt_tuning/model.py @@ -0,0 +1,89 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch + +from .config import PromptTuningInit + + +class PromptEmbedding(torch.nn.Module): + """ + The model to encode virtual tokens into prompt embeddings. + + Args: + config ([`PromptTuningConfig`]): The configuration of the prompt embedding. + word_embeddings (`torch.nn.Module`): The word embeddings of the base transformer model. + + **Attributes**: + - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt embedding. + + Example: + + ```py + >>> from peft import PromptEmbedding, PromptTuningConfig + + >>> config = PromptTuningConfig( + ... peft_type="PROMPT_TUNING", + ... task_type="SEQ_2_SEQ_LM", + ... num_virtual_tokens=20, + ... token_dim=768, + ... num_transformer_submodules=1, + ... num_attention_heads=12, + ... num_layers=12, + ... prompt_tuning_init="TEXT", + ... prompt_tuning_init_text="Predict if sentiment of this review is positive, negative or neutral", + ... tokenizer_name_or_path="t5-base", + ... 
) + + >>> # t5_model.shared is the word embeddings of the base model + >>> prompt_embedding = PromptEmbedding(config, t5_model.shared) + ``` + + Input Shape: (`batch_size`, `total_virtual_tokens`) + + Output Shape: (`batch_size`, `total_virtual_tokens`, `token_dim`) + """ + + def __init__(self, config, word_embeddings): + super().__init__() + + total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules + self.embedding = torch.nn.Embedding(total_virtual_tokens, config.token_dim) + if config.prompt_tuning_init == PromptTuningInit.TEXT and not config.inference_mode: + from transformers import AutoTokenizer + + tokenizer_kwargs = config.tokenizer_kwargs or {} + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path, **tokenizer_kwargs) + init_text = config.prompt_tuning_init_text + init_token_ids = tokenizer(init_text)["input_ids"] + # Trim or iterate until num_text_tokens matches total_virtual_tokens + num_text_tokens = len(init_token_ids) + if num_text_tokens > total_virtual_tokens: + init_token_ids = init_token_ids[:total_virtual_tokens] + elif num_text_tokens < total_virtual_tokens: + num_reps = math.ceil(total_virtual_tokens / num_text_tokens) + init_token_ids = init_token_ids * num_reps + init_token_ids = init_token_ids[:total_virtual_tokens] + init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device) + + word_embedding_weights = word_embeddings(init_token_ids).detach().clone() + word_embedding_weights = word_embedding_weights.to(torch.float32) + self.embedding.weight = torch.nn.Parameter(word_embedding_weights) + + def forward(self, indices): + # Just get embeddings + prompt_embeddings = self.embedding(indices) + return prompt_embeddings diff --git a/MoRA/peft_mora/tuners/tuners_utils.py b/MoRA/peft_mora/tuners/tuners_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0437dbc1699178cc41af91281b5352912701dc --- /dev/null +++ b/MoRA/peft_mora/tuners/tuners_utils.py @@ -0,0 +1,667 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import logging +import re +import warnings +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from accelerate.hooks import AlignDevicesHook +from accelerate.utils import named_module_tensors, offload_state_dict +from torch import nn +from transformers import PreTrainedModel +from transformers.pytorch_utils import Conv1D + +from peft_mora.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND + +from ..config import PeftConfig +from ..utils import ModulesToSaveWrapper, _get_submodules + + +logger = logging.getLogger(__name__) + + +@contextmanager +def onload_layer(layer): + r""" + A utility for modifying a module containing one or more tuners and a base layer, any of which are offloaded to the + CPU or disk. 
Moves a module's sub-modules to the execution device before some action is performed, after that the + base layer state dictionary is re-assigned (if that layer was offloaded to the disk) and finally the parameters are + offloaded. + + If the module has no offloaded sub-modules, this function does nothing. + + Args: + layer ('torch.nn.Module'): + layer with tuners to be merged + """ + + offloaded_modules = [] + for name, module in layer.named_modules(): + if name in ["", "base_layer"]: + continue + if hasattr(module, "_hf_hook") and isinstance(module._hf_hook, AlignDevicesHook) and module._hf_hook.offload: + module._hf_hook.pre_forward(module) + offloaded_modules.append(module) + + base_layer_offload = False + if hasattr(layer, "base_layer") and ( + hasattr(layer.base_layer, "_hf_hook") + and isinstance(layer.base_layer._hf_hook, AlignDevicesHook) + and layer.base_layer._hf_hook.offload + ): + if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values(): + # retrieve the name of the original disk-offload directory + offload_folder = layer.base_layer._hf_hook.weights_map.dataset.save_folder + layer.base_layer._hf_hook.pre_forward(layer.base_layer) + base_layer_offload = True + + yield + + for module in offloaded_modules: + module._hf_hook.post_forward(module, torch.tensor([])) + + if base_layer_offload: + # re-make weights map (must be on cpu to send params to the disk via memmap if disk offload) + layer.base_layer._hf_hook.weights_map = { + name: param.to("cpu") for name, param in named_module_tensors(layer.base_layer) + } + # offload weights map to disk if original device is the disk + if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values(): + # rewrite directory with merged weights + offload_state_dict(offload_folder, layer.base_layer._hf_hook.weights_map) + layer.base_layer._hf_hook.post_forward(layer.base_layer, torch.tensor([])) + + +class BaseTuner(nn.Module, ABC): + r""" + A base tuner model that provides the common methods and attributes for all tuners that are injectable into a + torch.nn.Module + + For adding a new Tuner class, one needs to overwrite the following methods: + + - **_prepare_adapter_config**: + A private method to eventually prepare the adapter config, for example in case the field `target_modules` is + missing. + - **_create_and_replace**: + A private method to create and replace the target module with the adapter module. + - **_check_target_module_exists**: + A private helper method to check if the passed module's key name matches any of the target modules in the + adapter_config. + + The easiest is to check what is done in the `peft.tuners.lora.LoraModel` class. + + Attributes: + model (`torch.nn.Module`): + The model to which the adapter tuner layers will be attached. + forward (`Callable`): + The forward method of the model. + peft_config (`Union[`PeftConfig`, dict[str, PeftConfig]]`): + The adapter configuration object, it should be a dictionary of `str` to `PeftConfig` objects. One can also + pass a PeftConfig object and a new adapter will be created with the default name `adapter` or create a new + dictionary with a key `adapter_name` and a value of that peft config. + config (`dict[str, Any]`): + The model configuration object, it should be a dictionary of `str` to `Any` objects. + targeted_module_names (`list[str]`): + The list of module names that were actually adapted. Can be useful to inspect if you want to quickly + double-check that the `config.target_modules` where specified correctly. 
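+
+    Example (an illustrative sketch added for orientation, not part of the upstream docstring; the
+    "gpt2" checkpoint and the "c_attn" target module are placeholder choices):
+
+    ```py
+    >>> from transformers import AutoModelForCausalLM
+    >>> from peft import LoraConfig, get_peft_model
+
+    >>> base_model = AutoModelForCausalLM.from_pretrained("gpt2")
+    >>> config = LoraConfig(task_type="CAUSAL_LM", target_modules=["c_attn"], r=8)
+    >>> peft_model = get_peft_model(base_model, config)
+    >>> # get_peft_model builds a LoraModel (a BaseTuner subclass) around the base model and
+    >>> # records every adapted module name in `peft_model.base_model.targeted_module_names`
+    ```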
+ """ + + def __init__(self, model, peft_config: Union[PeftConfig, dict[str, PeftConfig]], adapter_name: str) -> None: + super().__init__() + + self.model = model + self.targeted_module_names: list[str] = [] + + # For advanced developers, if you want to attach multiple adapters to your + # model, just add a `peft_config` dict attribute to your model. + if not hasattr(self, "peft_config"): + self.peft_config = {adapter_name: peft_config} if isinstance(peft_config, PeftConfig) else peft_config + else: + logger.info( + "Already found a `peft_config` attribute in the model. This will lead to having multiple adapters" + " in the model. Make sure to know what you are doing!" + ) + if isinstance(peft_config, PeftConfig): + self.peft_config[adapter_name] = peft_config + else: + # user is adding a dict of PeftConfigs + self.peft_config.update(peft_config) + + self.active_adapter = adapter_name + self.inject_adapter(self.model, adapter_name) + + # Copy the peft_config in the injected model. + self.model.peft_config = self.peft_config + + @property + def active_adapters(self) -> list[str]: + if isinstance(self.active_adapter, str): + return [self.active_adapter] + # is already a list of str + return self.active_adapter + + def forward(self, *args: Any, **kwargs: Any): + return self.model.forward(*args, **kwargs) + + @abstractmethod + def _prepare_adapter_config(self, peft_config: PeftConfig, model_config: dict) -> PeftConfig: + r""" + A private method to eventually prepare the adapter config. For transformers based models, if + `peft_config.target_modules` is None, we can automatically infer the target modules from the + `TRANSFORMERS_MODELS_TO_XXX_TARGET_MODULES_MAPPING`. This method can be further refactored in the future to + automatically infer it for all tuner models. + + Check out `peft.tuner.lora.LoraModel._prepare_adapter_config` for an example. + + Args: + peft_config (`str`): + The adapter config. + model_config (`str`): + The transformers model config, that config should contain the `model_type` key. + """ + ... + + @abstractmethod + def _check_target_module_exists(peft_config: PeftConfig, key: str) -> bool: + r""" + A helper private method to check if the passed module's key name matches any of the target modules in the + `peft_config.target_modules` list. If it does, return `True`, else return `False`. + + Args: + peft_config (`PeftConfig`): + The adapter config. + key (`str`): + The module's key name. + """ + ... + + @abstractmethod + def _create_and_replace( + self, + peft_config: PeftConfig, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + r""" + Inplace replacement of the target module with the adapter layer. This method needs to be overridden by all the + tuner classes. + + Check `peft.tuners.lora.LoraModel._create_and_replace` for an example. + + Args: + peft_config (`PeftConfig`): + The adapter config. + adapter_name (`str`): + The adapter name. + target (`nn.Module`): + The target module. + target_name (`str`): + The target module's name. + parent (`nn.Module`): + The parent module. + current_key (`str`): + The key of the current target being adapted. + """ + ... + + @abstractmethod + def _mark_only_adapters_as_trainable(self, model: nn.Module): + r""" + A helper method to mark only the adapter layers as trainable (i.e. module.requires_grad = False) This needs to + be overridden for all tuner classes to match the correct key names. 
+ + Check `peft.tuners.lora.LoraModel._mark_only_adapters_as_trainable` for an example. + """ + ... + + def _check_new_adapter_config(self, config: PeftConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + pass + + def inject_adapter(self, model: nn.Module, adapter_name: str): + r""" + Creates adapter layers and replaces the target modules with the adapter layers. This method is called under the + hood by `peft.mapping.get_peft_model` if a non-prompt tuning adapter class is passed. + + The corresponding PEFT config is directly retrieved from the `peft_config` attribute of the BaseTuner class. + + Args: + model (`nn.Module`): + The model to be tuned. + adapter_name (`str`): + The adapter name. + """ + peft_config = self.peft_config[adapter_name] + # Note: If possible, all checks should be performed *at the start of this method*. + # This way, we can raise early if something goes wrong, without leaving the model + # in a bad (half-initialized) state. + self._check_new_adapter_config(peft_config) + + is_target_modules_in_base_model = False + key_list = [key for key, _ in model.named_modules()] + + _check_for_modules_to_save = getattr(peft_config, "modules_to_save", None) is not None + _has_modules_to_save = False + + model_config = getattr(model, "config", {"model_type": "custom"}) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + + peft_config = self._prepare_adapter_config(peft_config, model_config) + + # update peft_config.target_modules if required + peft_config = _maybe_include_all_linear_layers(peft_config, model) + + for key in key_list: + # Check for modules_to_save in case + if _check_for_modules_to_save and any( + key.endswith(f"{module_to_save}") for module_to_save in peft_config.modules_to_save + ): + # Optionally set the modules to save + parent, target, target_name = _get_submodules(model, key) + + if not isinstance(target, ModulesToSaveWrapper): + new_module = ModulesToSaveWrapper(target, adapter_name) + setattr(parent, target_name, new_module) + else: + target.update(adapter_name) + + _has_modules_to_save = True + continue + + if not self._check_target_module_exists(peft_config, key): + continue + + self.targeted_module_names.append(key) + is_target_modules_in_base_model = True + parent, target, target_name = _get_submodules(model, key) + self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key) + + if not is_target_modules_in_base_model: + raise ValueError( + f"Target modules {peft_config.target_modules} not found in the base model. " + f"Please check the target modules and try again." + ) + + self._mark_only_adapters_as_trainable(model) + + if self.peft_config[adapter_name].inference_mode: + for n, p in model.named_parameters(): + if adapter_name in n: + p.requires_grad = False + + if _has_modules_to_save: + if not hasattr(model, "modules_to_save"): + model.modules_to_save = set(peft_config.modules_to_save) + else: + model.modules_to_save.update(set(peft_config.modules_to_save)) + + def merge_adapter(self, adapter_names: Optional[list[str]] = None) -> None: + """ + This method merges the adapter layers into the base model. + + Merging adapters can lead to a speed up of the forward pass. A copy of the adapter weights is still kept in + memory, which is required to unmerge the adapters. 
In order to merge the adapter weights without keeping them + in memory, please call `merge_and_unload`. + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + """ + for module in self.model.modules(): + if isinstance(module, BaseTunerLayer): + with onload_layer(module): + module.merge(adapter_names=adapter_names) + + def unmerge_adapter(self): + """ + This method unmerges all merged adapter layers from the base model. + """ + for module in self.model.modules(): + if isinstance(module, BaseTunerLayer): + with onload_layer(module): + module.unmerge() + + def _unloading_checks(self, adapter_names: Optional[list[str]]): + adapters_to_consider = adapter_names or self.active_adapters + is_modules_to_save_available = any( + self.peft_config[adapter].modules_to_save for adapter in adapters_to_consider + ) + if is_modules_to_save_available and len(adapters_to_consider) > 1: + raise ValueError("Cannot unload multiple adapters that specify `modules_to_save`.") + + +class BaseTunerLayer(ABC): + r""" + A tuner layer mixin that provides the common methods and attributes for all tuners. + + Args: + is_pluggable (`bool`, *optional*): + Whether the adapter layer can be plugged to any pytorch module + active_adapters (Union[List[`str`], `str`], *optional*): + The name of the active adapter. + """ + + active_adapter = None + + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str] = () + # All names of other parameters that may contain adapter-related parameters + other_param_names: tuple[str] = () + + # indicates whether all adapters should be disabled + _disable_adapters: bool = False + + # the currently active adapter(s) + _active_adapter: str | list[str] = "default" + + # List all merged adapters + merged_adapters: list[str] = [] + + def get_base_layer(self) -> nn.Module: + """ + (Recursively) get the base_layer. + + This is necessary for the case that the tuner layer wraps another tuner layer. + + """ + base_layer = self + while hasattr(base_layer, "base_layer"): + base_layer = base_layer.base_layer + return base_layer + + @property + def weight(self) -> torch.Tensor: + # This is required for some transformers code, e.g. for T5, weight is accessed as: + # self.wo.weight + # where "wo" is the adapter layer. 
+ # https://github.com/huggingface/transformers/blob/78f6ed6c70b29c1560780e3869a7ad4c6b3d2710/src/transformers + # /models/t5/modeling_t5.py#L292 + base_layer = self.get_base_layer() + if hasattr(base_layer, "qweight"): + # QuantLinear + weight = base_layer.qweight + else: + # Other layers + weight = base_layer.weight + return weight + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + raise NotImplementedError + + def unmerge(self) -> None: + raise NotImplementedError + + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + + @property + def disable_adapters(self) -> bool: + # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method + return self._disable_adapters + + @property + def active_adapter(self) -> str: + # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method + return self._active_adapter + + @property + def active_adapters(self): + if isinstance(self.active_adapter, str): + return [self.active_adapter] + # is already a list of str + return self.active_adapter + + def enable_adapters(self, enabled: bool) -> None: + """Toggle the enabling and disabling of adapters + + Takes care of setting the requires_grad flag for the adapter weights. + + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if enabled: + self.set_adapter(self.active_adapters) + self._disable_adapters = False + else: + # disable grads on all adapter layers + for layer_name in self.adapter_layer_names: + layer = getattr(self, layer_name) + layer.requires_grad_(False) + self._disable_adapters = True + + def set_adapter(self, adapter_names: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `List[str]`): Name of the adapter(s) to be activated. + """ + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + # Deactivate grads on the inactive adapter and activate grads on the active adapter + for layer_name in self.adapter_layer_names: + module_dict = getattr(self, layer_name) + for key, layer in module_dict.items(): + if key in adapter_names: + # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may + # happen if a completely different adapter layer is being activated. + layer.requires_grad_(True) + else: + layer.requires_grad_(False) + + self._active_adapter = adapter_names + + def _all_available_adapter_names(self) -> list[str]: + """Return a sorted list of all available adapter names""" + adapter_names = set() + for name in self.adapter_layer_names + self.other_param_names: + # we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter + # names + attr = getattr(self, name) + if hasattr(attr, "keys"): + adapter_names.update(attr.keys()) + return sorted(adapter_names) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Delete an adapter from the layer + + This should be called on all adapter layers, or else we will get an inconsistent state. + + This method will also set a new active adapter if the deleted adapter was an active adapter. 
It is important + that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers. + + Args: + adapter_name (`str`): The name of the adapter to delete + + """ + for attr in self.adapter_layer_names + self.other_param_names: + if adapter_name in getattr(self, attr): + del getattr(self, attr)[adapter_name] + + if adapter_name in self.active_adapters: + # choose a new active adapter + active_adapters = self.active_adapters[:] + active_adapters.remove(adapter_name) + if active_adapters: + self.set_adapter(active_adapters) + else: + # no active adapters left, set a new default adapter + # here we get the list of all adapters existing adapter names and choose the first one + remaining_adapters = self._all_available_adapter_names() + if not remaining_adapters: + self.set_adapter([]) + else: + new_active_adapter = remaining_adapters[0] + warnings.warn( + f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to " + f"{new_active_adapter}." + ) + self.set_adapter(remaining_adapters[0]) + + +def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: + """A helper method to check if the passed module's key name matches any of the target modules in the adapter_config. + + Args: + config (`LoraConfig` | `LycorisConfig`): A config to match target modules from + key (`str`): A key to search any matches in config + + Returns: + `bool` | `re.Match[str]` | `None`: True of match object if key matches any target modules from config, False or + None if no match found + """ + if isinstance(config.target_modules, str): + target_module_found = re.fullmatch(config.target_modules, key) + elif key in config.target_modules: + # this module is specified directly in target_modules + target_module_found = True + else: + target_module_found = any(key.endswith(f".{target_key}") for target_key in config.target_modules) + + layer_indexes = getattr(config, "layers_to_transform", None) + layers_pattern = getattr(config, "layers_pattern", None) + + is_using_layer_indexes = layer_indexes is not None and ( + len(layer_indexes) != 0 if isinstance(layer_indexes, list) else True + ) + if is_using_layer_indexes and target_module_found: + layer_index = None + # TODO: It's still unclear how empty layers_pattern (None, [], or "") should behave + # For now, empty layers_pattern means any layer pattern is ok + if layers_pattern is None or len(layers_pattern) == 0: + layer_index = re.match(r".*\.[^.]*\.(\d+)\.", key) + else: + layers_pattern = [layers_pattern] if isinstance(layers_pattern, str) else layers_pattern + for pattern in layers_pattern: + layer_index = re.match(rf".*\.{pattern}\.(\d+)\.", key) + if layer_index is not None: + break + + if layer_index is None: + target_module_found = False + else: + layer_index = int(layer_index.group(1)) + if isinstance(layer_indexes, int): + target_module_found = layer_index == layer_indexes + else: + target_module_found = layer_index in layer_indexes + + return target_module_found + + +def inspect_matched_modules(tuner: BaseTuner, adapter_name: str = "default") -> dict: + """ + A helper function to inspect the set of matched and unmatched modules for a PEFT model and the given adapter. 
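+
+    Example (an illustrative sketch; `peft_model` is assumed to be any model returned by
+    `get_peft_model`, whose `base_model` attribute is the `BaseTuner` instance expected here):
+
+    ```py
+    >>> module_dict = inspect_matched_modules(peft_model.base_model)
+    >>> sorted(module_dict)
+    ['matched', 'unmatched']
+    >>> # every module key of the wrapped model lands in exactly one of the two lists
+    ```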
+ """ + config = tuner.peft_config[adapter_name] + key_list = [key for key, _ in tuner.model.named_modules()] + module_dict = {"matched": [], "unmatched": []} + for key in key_list: + if tuner._check_target_module_exists(config, key): + module_dict["matched"].append(key) + else: + module_dict["unmatched"].append(key) + return module_dict + + +def _maybe_include_all_linear_layers(peft_config: PeftConfig, model: nn.Module) -> PeftConfig: + """ + Helper function to update `target_modules` to all linear/Conv1D layers if provided as 'all-linear'. Adapted from + the QLoRA repository: https://github.com/artidoro/qlora/blob/main/qlora.py + """ + + # if `target_modules` is a string, convert to lower case and check if it matches "all-linear" + if not ( + isinstance(peft_config.target_modules, str) + and peft_config.target_modules.lower() == INCLUDE_LINEAR_LAYERS_SHORTHAND + ): + return peft_config + + if not isinstance(model, PreTrainedModel): + raise ValueError( + f"Only instances of PreTrainedModel support `target_modules={INCLUDE_LINEAR_LAYERS_SHORTHAND!r}`" + ) + + linear_classes = (torch.nn.Linear, Conv1D) + + linear_module_names = set() + for name, module in model.named_modules(): + # match with all linear classes. + if isinstance(module, linear_classes): + names = name.rsplit(".", 1)[-1] # get the base name + linear_module_names.add(names) + + # ignore the last classification head for text generation models + output_emb = model.get_output_embeddings() + if output_emb is not None: + last_module_name = [name for name, module in model.named_modules() if module is output_emb][0] + linear_module_names -= {last_module_name} + peft_config.target_modules = linear_module_names + return peft_config + + +def check_adapters_to_merge(module: BaseTunerLayer, adapter_names: Optional[list[str]] = None) -> list[str]: + """ + Helper function to check which adapters should be merged. + + Only return those adapters that are not already merged. Give a warning if some or all of the adapters are already + merged. + + """ + if adapter_names is None: + adapter_names = module.active_adapters + + if module.merged: + merged_adapters = set(module.merged_adapters) + adapter_names = [name for name in adapter_names if name not in merged_adapters] + + if adapter_names: + warnings.warn( + f"Already following adapters were merged {','.join(module.merged_adapters)}. " + f"You are now additionally merging {','.join(adapter_names)}." + ) + else: + warnings.warn("All adapters are already merged, nothing to do.") + + return adapter_names diff --git a/MoRA/peft_mora/utils/__init__.py b/MoRA/peft_mora/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dbd48de9c7beb40ee0570907353c4d1c1c109717 --- /dev/null +++ b/MoRA/peft_mora/utils/__init__.py @@ -0,0 +1,51 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
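`_maybe_include_all_linear_layers` above expands the `all-linear` shorthand by collecting the base names of every linear layer and dropping the output embedding head. A toy reproduction on a hypothetical module (no transformers dependency; all module names invented):

``` python
import torch.nn as nn

class ToyLM(nn.Module):
    # stand-in for a tiny decoder block plus LM head
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Linear(32, 32)
        self.v_proj = nn.Linear(32, 32)
        self.up_proj = nn.Linear(32, 64)
        self.down_proj = nn.Linear(64, 32)
        self.lm_head = nn.Linear(32, 100)

model = ToyLM()
linear_module_names = {
    name.rsplit(".", 1)[-1]                 # keep only the base name, as in the helper above
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear)
}
linear_module_names -= {"lm_head"}          # the output head is excluded, as above
print(sorted(linear_module_names))          # ['down_proj', 'q_proj', 'up_proj', 'v_proj']
```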
+# See the License for the specific language governing permissions and +# limitations under the License. + +# from .config import PeftConfig, PeftType, PromptLearningConfig, TaskType +from .peft_types import PeftType, TaskType +from .other import ( + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + CONFIG_NAME, + WEIGHTS_NAME, + SAFETENSORS_WEIGHTS_NAME, + INCLUDE_LINEAR_LAYERS_SHORTHAND, + _set_trainable, + bloom_model_postprocess_past_key_value, + prepare_model_for_int8_training, + prepare_model_for_kbit_training, + shift_tokens_right, + transpose, + _get_batch_size, + _get_submodules, + _set_adapter, + _freeze_adapter, + ModulesToSaveWrapper, + _prepare_prompt_learning_config, + _is_valid_match, + infer_device, + get_auto_gptq_quant_linear, + get_quantization_config, + id_tensor_storage, + cast_mixed_precision_params, +) +from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/MoRA/peft_mora/utils/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca0a5636a270e534b70a70cba2f186d707a24471 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/constants.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4da0c1fbe2e05f36e21caa508baa54ad22de457 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/constants.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/integrations.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/integrations.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b20211e11e77879dc6329c69d520f3ba8707c806 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/integrations.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/merge_utils.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/merge_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c89d7f523a44271ef34f1ffed601a763b6ac313d Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/merge_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/other.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/other.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d6109f8e7c347da53a2eec058ef3cef1025d5b2 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/other.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/peft_types.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/peft_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09c30b269ef49c25dcc5b664609a2b74db2d7fbb Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/peft_types.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/save_and_load.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/save_and_load.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7541849e51a3b1b9f452d900aa6109f6e4b9d98 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/save_and_load.cpython-312.pyc differ diff 
--git a/MoRA/peft_mora/utils/constants.py b/MoRA/peft_mora/utils/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..41047431cef3c18c603b4a331b791cbf00f4cdf2 --- /dev/null +++ b/MoRA/peft_mora/utils/constants.py @@ -0,0 +1,158 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + + +# needed for prefix-tuning of bloom model +def bloom_model_postprocess_past_key_value(past_key_values): + past_key_values = torch.cat(past_key_values) + total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim = past_key_values.shape + keys = past_key_values[: total_layers // 2] + keys = keys.transpose(2, 3).reshape( + total_layers // 2, batch_size * num_attention_heads, head_dim, num_virtual_tokens + ) + values = past_key_values[total_layers // 2 :] + values = values.reshape(total_layers // 2, batch_size * num_attention_heads, num_virtual_tokens, head_dim) + + return tuple(zip(keys, values)) + + +# needed for prefix-tuning of StarCoder models +def starcoder_model_postprocess_past_key_value(past_key_values): + result = [] + for k in past_key_values: + k = k[:, :, 0] + k = k.permute([1, 2, 0, 3]) + k = k.reshape(*k.shape[:-2], -1) + result.append(k) + return tuple(result) + + +TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = { + "bloom": bloom_model_postprocess_past_key_value, + "gpt_bigcode": starcoder_model_postprocess_past_key_value, +} + + +TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "v"], + "mt5": ["q", "v"], + "bart": ["q_proj", "v_proj"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "blip-2": ["q", "v", "q_proj", "v_proj"], + "opt": ["q_proj", "v_proj"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "value"], + "xlm-roberta": ["query", "value"], + "electra": ["query", "value"], + "deberta-v2": ["query_proj", "value_proj"], + "deberta": ["in_proj"], + "layoutlm": ["query", "value"], + "llama": ["q_proj", "v_proj"], + "chatglm": ["query_key_value"], + "gpt_bigcode": ["c_attn"], + "mpt": ["Wqkv"], + "RefinedWebModel": ["query_key_value"], + "RefinedWeb": ["query_key_value"], + "falcon": ["query_key_value"], + "btlm": ["c_proj", "c_attn"], + "codegen": ["qkv_proj"], + "mistral": ["q_proj", "v_proj"], + "mixtral": ["q_proj", "v_proj"], + "stablelm": ["q_proj", "v_proj"], + "phi": ["q_proj", "v_proj", "fc1", "fc2"], + "gemma": ["q_proj", "v_proj"], +} + +TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = { + "t5": ["k", "v", "wo"], + "mt5": ["k", "v", "wi_1"], + "gpt2": ["c_attn", "mlp.c_proj"], + "bloom": ["query_key_value", "mlp.dense_4h_to_h"], + "roberta": ["key", "value", "output.dense"], + "opt": ["q_proj", "k_proj", "fc2"], + "gptj": ["q_proj", "v_proj", "fc_out"], + "gpt_neox": ["query_key_value", "dense_4h_to_h"], + "gpt_neo": ["q_proj", "v_proj", "c_proj"], + "bart": ["q_proj", "v_proj", "fc2"], + "gpt_bigcode": ["c_attn", "mlp.c_proj"], + 
"llama": ["k_proj", "v_proj", "down_proj"], + "mistral": ["k_proj", "v_proj", "down_proj"], + "mixtral": ["k_proj", "v_proj", "w2"], + "bert": ["key", "value", "output.dense"], + "deberta-v2": ["key_proj", "value_proj", "output.dense"], + "deberta": ["in_proj", "output.dense"], + "RefinedWebModel": ["query_key_value", "dense_4h_to_h"], + "RefinedWeb": ["query_key_value", "dense_4h_to_h"], + "falcon": ["query_key_value", "dense_4h_to_h"], + "phi": ["q_proj", "v_proj", "fc2"], + "gemma": ["q_proj", "v_proj", "down_proj"], +} + +TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = { + "t5": ["wo"], + "mt5": [], + "gpt2": ["mlp.c_proj"], + "bloom": ["mlp.dense_4h_to_h"], + "roberta": ["output.dense"], + "opt": ["fc2"], + "gptj": ["fc_out"], + "gpt_neox": ["dense_4h_to_h"], + "gpt_neo": ["c_proj"], + "bart": ["fc2"], + "gpt_bigcode": ["mlp.c_proj"], + "llama": ["down_proj"], + "mistral": ["down_proj"], + "mixtral": ["w2"], + "bert": ["output.dense"], + "deberta-v2": ["output.dense"], + "deberta": ["output.dense"], + "RefinedWeb": ["dense_4h_to_h"], + "RefinedWebModel": ["dense_4h_to_h"], + "falcon": ["dense_4h_to_h"], + "phi": ["fc2"], + "gemma": ["down_proj"], +} + +TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "k", "v", "o", "wi", "wo"], + "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], + "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "llama": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "key", "value", "dense"], + # "xlm-roberta": ["query", "value"], + # "electra": ["query", "value"], + "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], + "gpt_bigcode": ["c_attn"], + "deberta": ["in_proj"], + # "layoutlm": ["query", "value"], +} + +WEIGHTS_NAME = "adapter_model.bin" +SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" +CONFIG_NAME = "adapter_config.json" +EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"] +INCLUDE_LINEAR_LAYERS_SHORTHAND = "all-linear" +TOKENIZER_CONFIG_NAME = "tokenizer_config.json" diff --git a/MoRA/peft_mora/utils/integrations.py b/MoRA/peft_mora/utils/integrations.py new file mode 100644 index 0000000000000000000000000000000000000000..3b0e139483ec12632a4183441628502e7122f179 --- /dev/null +++ b/MoRA/peft_mora/utils/integrations.py @@ -0,0 +1,39 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from contextlib import contextmanager + +import packaging.version +import torch +import transformers + + +@contextmanager +def gather_params_ctx(module: torch.nn.Module, modifier_rank: int = 0): + """Call DeepSpeed GatheredParameters context manager if DeepSpeed is enabled, otherwise do nothing.""" + if packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.33.0"): + from transformers.integrations import is_deepspeed_zero3_enabled + else: + from transformers.deepspeed import is_deepspeed_zero3_enabled + + if not is_deepspeed_zero3_enabled(): + yield + return + + import deepspeed + + params_to_gather = module.parameters() + with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=modifier_rank): + yield + return diff --git a/MoRA/peft_mora/utils/loftq_utils.py b/MoRA/peft_mora/utils/loftq_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9f83417f3ca350f198b8d759cfc2231ee6b56c00 --- /dev/null +++ b/MoRA/peft_mora/utils/loftq_utils.py @@ -0,0 +1,229 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference code: https://github.com/yxli2123/LoftQ/blob/main/utils.py +# Reference paper: https://arxiv.org/abs/2310.08659 + +import logging +from typing import Union + +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + + +if is_bnb_available(): + import bitsandbytes as bnb + + +class NFQuantizer: + def __init__(self, num_bits=2, device="cuda", method="normal", block_size=64, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_bits = num_bits + self.device = device + self.method = method + self.block_size = block_size + if self.method == "normal": + self.norm_lookup_table = self.create_normal_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + elif self.method == "uniform": + self.norm_lookup_table = self.create_uniform_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + else: + raise NotImplementedError("Other quantization methods not supported yet.") + + @staticmethod + def create_uniform_map(symmetric=False, num_bits=4): + if symmetric: + # print("symmetric uniform quantization") + negative = torch.linspace(-1, 0, 2 ** (num_bits - 1)) + positive = torch.linspace(0, 1, 2 ** (num_bits - 1)) + table = torch.cat([negative, positive[1:]]) + else: + # print("asymmetric uniform quantization") + table = torch.linspace(-1, 1, 2**num_bits) + return table + + @staticmethod + def create_normal_map(offset=0.9677083, symmetric=False, num_bits=2): + try: + from scipy.stats import norm + except ImportError: + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + + variations = 2**num_bits + if symmetric: + v = norm.ppf(torch.linspace(1 - offset, offset, variations + 1)).tolist() + values = [] + for index in range(len(v) - 1): + values.append(0.5 * v[index] + 0.5 * v[index + 1]) + v = values + else: + # one more positive value, this is an asymmetric type + v1 = norm.ppf(torch.linspace(offset, 0.5, variations // 2 + 1)[:-1]).tolist() + v2 = [0] + v3 = (-norm.ppf(torch.linspace(offset, 0.5, variations // 2)[:-1])).tolist() + v = v1 + v2 + v3 + + values = torch.Tensor(v) + values = values.sort().values + values /= values.max() + return values + + def quantize_tensor(self, weight): + max_abs = torch.abs(weight).max() + weight_normed = weight / max_abs + + weight_normed_expanded = weight_normed.unsqueeze(-1) + + # Reshape L to have the same number of dimensions as X_expanded + L_reshaped = torch.tensor(self.norm_lookup_table).reshape(1, -1) + + # Calculate the absolute difference between X_expanded and L_reshaped + abs_diff = torch.abs(weight_normed_expanded - L_reshaped) + + # Find the index of the minimum absolute difference for each element + qweight = torch.argmin(abs_diff, dim=-1) + return qweight, max_abs + + def dequantize_tensor(self, qweight, max_abs): + qweight_flatten = qweight.flatten() + + weight_normed = self.norm_lookup_table[qweight_flatten] + weight = weight_normed * max_abs + + weight = weight.reshape(qweight.shape) + + return weight + + def quantize_block(self, weight): + if len(weight.shape) != 2: + raise ValueError(f"Only support 2D matrix, but your input has {len(weight.shape)} dimensions.") + if weight.shape[0] * weight.shape[1] % self.block_size != 0: + raise ValueError( + f"Weight with shape ({weight.shape[0]} x {weight.shape[1]}) " + f"is not dividable by block size {self.block_size}." 
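`quantize_tensor`/`dequantize_tensor` above round each normalized weight to the nearest entry of a small lookup table and invert that mapping on the way back. A standalone round trip using a uniform 2-bit table instead of the NormalFloat table (which needs scipy):

``` python
import torch

lookup = torch.linspace(-1, 1, 2 ** 2)          # 4 levels for 2-bit quantization

weight = torch.randn(4, 4)
max_abs = weight.abs().max()
normed = weight / max_abs                       # scale into [-1, 1]

# index of the closest lookup value for every element
idx = torch.argmin((normed.unsqueeze(-1) - lookup).abs(), dim=-1)

dequant = lookup[idx] * max_abs                 # map indices back to values
print((weight - dequant).abs().max())           # quantization error, bounded by max_abs / 3
```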
+ ) + + M, N = weight.shape + device = weight.device + + # Quantization + weight_flatten = weight.flatten() # (M*N, ) + weight_block = weight_flatten.reshape(-1, self.block_size) # (L, B), L = M * N / B + if self.method == "normal": + weight_max = weight_block.abs().max(dim=-1)[0] # (L, 1) + elif self.method == "uniform": + weight_max = weight_block.mean(dim=-1) + 2.5 * weight_block.std(dim=-1) + else: + raise NotImplementedError("Method not supported yet.") + weight_max = weight_max.unsqueeze(-1) + weight_divabs = weight_block / weight_max # (L, B) + weight_divabs = weight_divabs.unsqueeze(-1) # (L, B, 1) + L_reshaped = self.norm_lookup_table.reshape(1, -1) # (1, 2**K) + + abs_diff = torch.abs(weight_divabs - L_reshaped) # (L, B, 2**K) + qweight = torch.argmin(abs_diff, dim=-1) # (L, B) + + # Pack multiple k-bit into uint8 + qweight = qweight.reshape(-1, 8 // self.num_bits) + qweight_pack = torch.zeros((M * N // 8 * self.num_bits, 1), dtype=torch.uint8, device=device) + + # data format example: + # [1, 0, 3, 2] or [01, 00, 11, 10] -> [10110001], LIFO + for i in range(8 // self.num_bits): + qweight[:, i] = qweight[:, i] << i * self.num_bits + qweight_pack[:, 0] |= qweight[:, i] + + return qweight_pack, weight_max, weight.shape + + def dequantize_block(self, qweight, weight_max, weight_shape): + # unpack weight + device = qweight.device + weight = torch.zeros((qweight.shape[0], 8 // self.num_bits), dtype=torch.float32, device=device) + for i in range(8 // self.num_bits): + lookup_table_idx = qweight.to(torch.long) % 2**self.num_bits # get the most right 2 bits + lookup_table_idx = lookup_table_idx.to(torch.long) + weight[:, i] = self.norm_lookup_table[lookup_table_idx].squeeze() + qweight = qweight >> self.num_bits # right shift 2 bits of the original data + + weight_block = weight.reshape(-1, self.block_size) + weight = weight_block * weight_max + weight = weight.reshape(weight_shape) + + return weight + + +def _low_rank_decomposition(weight, reduced_rank=32): + """ + :param weight: The matrix to decompose, of shape (H, W) :param reduced_rank: the final rank :return: + """ + matrix_dimension = len(weight.size()) + if matrix_dimension != 2: + raise ValueError(f"Only support 2D matrix, but your input has {matrix_dimension} dimensions.") + + # Use SVD to decompose a matrix, default full_matrices is False to save parameters + U, S, Vh = torch.linalg.svd(weight, full_matrices=False) + + L = U @ (torch.sqrt(torch.diag(S)[:, 0:reduced_rank])) + R = torch.sqrt(torch.diag(S)[0:reduced_rank, :]) @ Vh + + return {"L": L, "R": R, "U": U, "S": S, "Vh": Vh, "reduced_rank": reduced_rank} + + +@torch.no_grad() +def loftq_init(weight: Union[torch.Tensor, torch.nn.Parameter], num_bits: int, reduced_rank: int, num_iter=1): + if num_bits not in [2, 4, 8]: + raise ValueError("Only support 2, 4, 8 bits quantization") + if num_iter <= 0: + raise ValueError("Number of iterations must be greater than 0") + + out_feature, in_feature = weight.size() + device = weight.device + dtype = weight.dtype + + logging.info( + f"Weight: ({out_feature}, {in_feature}) | Rank: {reduced_rank} " + f"| Num Iter: {num_iter} | Num Bits: {num_bits}" + ) + if not is_bnb_4bit_available() or num_bits in [2, 8]: + quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64) + compute_device = device + else: + compute_device = "cuda" + + weight = weight.to(device=compute_device, dtype=torch.float32) + res = weight.clone() + for i in range(num_iter): + torch.cuda.empty_cache() + # Quantization + if num_bits == 4 
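`_low_rank_decomposition` above splits the square root of the singular values between the two factors, so the product `L @ R` is the rank-`r` truncated SVD (the best rank-`r` approximation in the Frobenius norm). A plain-torch check of that:

``` python
import torch

weight = torch.randn(64, 32)
r = 8

U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
sqrt_S = torch.sqrt(torch.diag(S))
L = U @ sqrt_S[:, :r]          # (64, r)
R = sqrt_S[:r, :] @ Vh         # (r, 32)

approx = L @ R                 # rank-r approximation of `weight`
print(torch.linalg.matrix_rank(approx))                        # tensor(8)
print(torch.linalg.norm(weight - approx) / torch.linalg.norm(weight))  # relative residual
```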
and is_bnb_4bit_available(): + qweight = bnb.nn.Params4bit( + res.to("cpu"), requires_grad=False, compress_statistics=False, quant_type="nf4" + ).to(compute_device) + dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state) + else: + quantized_weight, max_abs, shape = quantizer.quantize_block(res) + dequantized_weight = quantizer.dequantize_block(quantized_weight, max_abs, shape) + + res = weight - dequantized_weight + + # Decompose the residual by SVD + output = _low_rank_decomposition(res, reduced_rank=reduced_rank) + L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"] + res = weight - torch.mm(L, R) + + lora_A, lora_B = R, L + + return dequantized_weight.to(device=device, dtype=dtype), lora_A, lora_B diff --git a/MoRA/peft_mora/utils/merge_utils.py b/MoRA/peft_mora/utils/merge_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..900cb878e2d785e5e800caa58642f0532f4d574b --- /dev/null +++ b/MoRA/peft_mora/utils/merge_utils.py @@ -0,0 +1,268 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import List, Literal + +import torch + + +def reshape_weight_task_tensors(task_tensors, weights): + """ + Reshapes `weights` to match the shape of `task_tensors` by unsqeezing in the remaining dimenions. + + Args: + task_tensors (`torch.Tensor`): The tensors that will be used to reshape `weights`. + weights (`torch.Tensor`): The tensor to be reshaped. + + Returns: + `torch.Tensor`: The reshaped tensor. + """ + new_shape = weights.shape + (1,) * (task_tensors.dim() - weights.dim()) + weights = weights.view(new_shape) + return weights + + +def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> torch.Tensor: + """ + Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction + `density`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + + Returns: + `torch.Tensor`: The tensor with the pruned weights. + """ + mask = torch.zeros_like(tensor).reshape(-1) + k = int(density * tensor.numel()) + top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True) + mask[top_k[1]] = 1 + return tensor * mask.reshape(tensor.shape) + + +def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor: + """ + Prune random values based on the specified fraction `density`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. + + Returns: + `torch.Tensor`: The pruned tensor. 
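`magnitude_based_pruning` above keeps the `density` fraction of entries with the largest absolute value and zeroes the rest. A small worked example of the same operation:

``` python
import torch

def magnitude_prune(tensor: torch.Tensor, density: float) -> torch.Tensor:
    # keep the `density` fraction of entries with the largest magnitude, zero the rest
    k = int(density * tensor.numel())
    mask = torch.zeros_like(tensor).reshape(-1)
    topk = torch.topk(tensor.abs().reshape(-1), k=k, largest=True)
    mask[topk.indices] = 1
    return tensor * mask.reshape(tensor.shape)

delta = torch.tensor([[0.1, -2.0, 0.3], [1.5, -0.05, 0.7]])
print(magnitude_prune(delta, density=0.5))
# tensor([[ 0.0000, -2.0000,  0.0000],
#         [ 1.5000, -0.0000,  0.7000]])
```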
+ """ + mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density)) + pruned_tensor = tensor * mask + if rescale: + torch.div(input=pruned_tensor, other=density) + return pruned_tensor + + +def prune( + tensor: torch.Tensor, density: float, method: Literal["magnitude", "random"], rescale: bool = False +) -> torch.Tensor: + """ + Prune the values of task tensors based on the `method`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + method (`str`):The method to use to prune. Should be one of ["magnitude", "random"]. + rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. + + Returns: + `torch.Tensor`: The pruned tensor. + """ + if density >= 1: + warnings.warn(f"The density {density} is greater than or equal to 1, no pruning will be performed.") + return tensor + elif density < 0: + raise ValueError(f"Density should be >= 0, got {density}") + if method == "magnitude": + return magnitude_based_pruning(tensor, density) + elif method == "random": + return random_pruning(tensor, density, rescale=rescale) + else: + raise ValueError(f"Unknown method {method}") + + +def calculate_majority_sign_mask( + tensor: torch.Tensor, method: Literal["total", "frequency"] = "total" +) -> torch.Tensor: + """ + Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0. + + Args: + tensor (`torch.Tensor`):The tensor to get the mask from. + method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"]. + + Returns: + `torch.Tensor`: The majority sign mask. + """ + + sign = tensor.sign() + if method == "total": + sign_magnitude = tensor.sum(dim=0) + elif method == "frequency": + sign_magnitude = sign.sum(dim=0) + else: + raise RuntimeError(f'Unimplemented mask method "{method}"') + majority_sign = torch.where(sign_magnitude >= 0, 1, -1) + return sign == majority_sign + + +def disjoint_merge(task_tensors: torch.Tensor, majority_sign_mask: torch.Tensor) -> torch.Tensor: + """ + Merge the task tensors using disjoint merge. + + Args: + task_tensors (`torch.Tensor`):The task tensors to merge. + majority_sign_mask (`torch.Tensor`):The mask of the majority sign across the task tensors. + + Returns: + `torch.Tensor`: The merged tensor. + """ + mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0) + num_params_preserved = majority_sign_mask.sum(dim=0) + return mixed_task_tensors / torch.clamp(num_params_preserved, min=1.0) + + +def task_arithmetic(task_tensors: List[torch.Tensor], weights: torch.Tensor) -> torch.Tensor: + """ + Merge the task tensors using `task arithmetic`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + + Returns: + `torch.Tensor`: The merged tensor. + """ + task_tensors = torch.stack(task_tensors, dim=0) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + mixed_task_tensors = weighted_task_tensors.sum(dim=0) + return mixed_task_tensors + + +def magnitude_prune(task_tensors: List[torch.Tensor], weights: torch.Tensor, density: float) -> torch.Tensor: + """ + Merge the task tensors using `task arithmetic`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`): The fraction of values to preserve. 
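`calculate_majority_sign_mask` and `disjoint_merge` above implement the "elect a sign, then average only the agreeing entries" step of TIES. A worked example on two hypothetical task deltas:

``` python
import torch

# two made-up task deltas for the same parameter
task_tensors = torch.stack([
    torch.tensor([0.4, -0.2,  0.1]),
    torch.tensor([0.2,  0.3, -0.5]),
])

sign_magnitude = task_tensors.sum(dim=0)                 # "total" sign election, as above
majority_sign = torch.where(sign_magnitude >= 0, 1, -1)
mask = task_tensors.sign() == majority_sign              # entries agreeing with the elected sign

counts = mask.sum(dim=0).clamp(min=1)                    # avoid dividing by zero
merged = (task_tensors * mask).sum(dim=0) / counts
print(merged)  # tensor([ 0.3000,  0.3000, -0.5000])
```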
Should be in [0,1]. + + Returns: + `torch.Tensor`: The merged tensor. + """ + # sparsify + task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + mixed_task_tensors = weighted_task_tensors.sum(dim=0) + return mixed_task_tensors + + +def ties( + task_tensors: List[torch.Tensor], + weights: torch.Tensor, + density: float, + majority_sign_method: Literal["total", "frequency"] = "total", +) -> torch.Tensor: + """ + Merge the task tensors using `ties`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + majority_sign_method (`str`): + The method to use to get the majority sign mask. Should be one of ["total", "frequency"]. + + Returns: + `torch.Tensor`: The merged tensor. + """ + # sparsify + task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # Elect Sign + majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + # Disjoint Merge + mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) + return mixed_task_tensors + + +def dare_linear(task_tensors: List[torch.Tensor], weights: torch.Tensor, density: float) -> torch.Tensor: + """ + Merge the task tensors using `dare linear`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + + Returns: + `torch.Tensor`: The merged tensor. + """ + # sparsify + task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + mixed_task_tensors = weighted_task_tensors.sum(dim=0) + return mixed_task_tensors + + +def dare_ties( + task_tensors: List[torch.Tensor], + weights: torch.Tensor, + density: float, + majority_sign_method: Literal["total", "frequency"] = "total", +) -> torch.Tensor: + """ + Merge the task tensors using `dare ties`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + majority_sign_method (`str`): + The method to use to get the majority sign mask. Should be one of ["total", "frequency"]. + + Returns: + `torch.Tensor`: The merged tensor. 
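`dare_linear`/`dare_ties` above first call `prune(..., method="random", rescale=True)`: keep each delta entry with probability `density` and divide by `density` so the expected update is unchanged. (Note that in the copy above, `random_pruning` does not assign the result of `torch.div` back to `pruned_tensor`, so the rescale appears to be dropped; the sketch below applies the division explicitly.)

``` python
import torch

torch.manual_seed(0)

def dare_drop(delta: torch.Tensor, density: float) -> torch.Tensor:
    # keep each entry with probability `density`, rescale to preserve the expectation
    mask = torch.bernoulli(torch.full_like(delta, density))
    return delta * mask / density

delta = torch.randn(10_000)
dropped = dare_drop(delta, density=0.3)
print(dropped.eq(0).float().mean())   # ~0.7 of the entries removed
print(delta.mean(), dropped.mean())   # the mean is preserved in expectation
```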
+ """ + # sparsify + task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # Elect Sign + majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + # Disjoint Merge + mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) + return mixed_task_tensors diff --git a/MoRA/peft_mora/utils/other.py b/MoRA/peft_mora/utils/other.py new file mode 100644 index 0000000000000000000000000000000000000000..71ad4848c0e9d037f3ebc02c1e3fccd1cd30958c --- /dev/null +++ b/MoRA/peft_mora/utils/other.py @@ -0,0 +1,593 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import inspect +import os +import warnings +from contextlib import nullcontext +from typing import Optional, Tuple + +import accelerate +import torch +from accelerate.hooks import add_hook_to_module, remove_hook_from_module +from accelerate.utils import is_npu_available, is_xpu_available +from huggingface_hub import file_exists +from huggingface_hub.utils import EntryNotFoundError, HFValidationError +from safetensors.torch import storage_ptr, storage_size + +from ..import_utils import is_auto_gptq_available, is_torch_tpu_available +from .constants import ( + CONFIG_NAME, + EMBEDDING_LAYER_NAMES, + INCLUDE_LINEAR_LAYERS_SHORTHAND, + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + WEIGHTS_NAME, + bloom_model_postprocess_past_key_value, + starcoder_model_postprocess_past_key_value, +) + + +__all__ = [ + "CONFIG_NAME", + "EMBEDDING_LAYER_NAMES", + "SAFETENSORS_WEIGHTS_NAME", + "TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING", + "WEIGHTS_NAME", + "INCLUDE_LINEAR_LAYERS_SHORTHAND", + "bloom_model_postprocess_past_key_value", + "starcoder_model_postprocess_past_key_value", +] + + +# Get current device name based on available devices +def infer_device() -> str: + if torch.cuda.is_available(): + return "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + elif is_xpu_available(): + return "xpu" + elif is_npu_available(): + return "npu" + return "cpu" + + +def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None): + r""" + Note this method only works for `transformers` models. 
+ + This method wraps the entire protocol for preparing a model before running a training. This includes: + 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm + head to fp32 + + Args: + model (`transformers.PreTrainedModel`): + The loaded model from `transformers` + use_gradient_checkpointing (`bool`, *optional*, defaults to `True`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): + Keyword arguments to pass to the gradient checkpointing function, please refer to the documentation of + `torch.utils.checkpoint.checkpoint` for more details about the arguments that you can pass to that method. + Note this is only available in the latest transformers versions (> 4.34.1). + """ + loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) + is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq" + is_aqlm_quantized = getattr(model, "quantization_method", None) == "aqlm" + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + + for name, param in model.named_parameters(): + # freeze base model's layers + param.requires_grad = False + + if not is_gptq_quantized and not is_aqlm_quantized: + # cast all non INT8 parameters to fp32 + for param in model.parameters(): + if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16): + param.data = param.data.to(torch.float32) + + if (loaded_in_kbit or is_gptq_quantized or is_aqlm_quantized) and use_gradient_checkpointing: + # When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack + if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]: + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # To support older transformers versions, check if the model supports gradient_checkpointing_kwargs + _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list( + inspect.signature(model.gradient_checkpointing_enable).parameters + ) + + if not _supports_gc_kwargs and len(gradient_checkpointing_kwargs) > 0: + warnings.warn( + "gradient_checkpointing_kwargs is not supported in this version of transformers. The passed kwargs will be ignored." + " if you want to use that feature, please upgrade to the latest version of transformers.", + FutureWarning, + ) + + gc_enable_kwargs = ( + {} if not _supports_gc_kwargs else {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs} + ) + + # enable gradient checkpointing for memory efficiency + model.gradient_checkpointing_enable(**gc_enable_kwargs) + return model + + +# For backward compatibility +def prepare_model_for_int8_training(*args, **kwargs): + warnings.warn( + "prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.", + FutureWarning, + ) + return prepare_model_for_kbit_training(*args, **kwargs) + + +# copied from transformers.models.bart.modeling_bart +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. 
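`prepare_model_for_kbit_training` above boils down to freezing every base parameter, upcasting any fp16/bf16 parameters to fp32, and then wiring up gradient checkpointing for quantized models. A minimal sketch of the first two steps on a toy module (a plain stand-in, not a transformers model):

``` python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 4)).to(torch.bfloat16)

# 1) freeze the base model
for param in model.parameters():
    param.requires_grad = False

# 2) upcast half-precision parameters to fp32 for numerically stable training
for param in model.parameters():
    if param.dtype in (torch.float16, torch.bfloat16):
        param.data = param.data.to(torch.float32)

print(all(not p.requires_grad for p in model.parameters()))  # True
print({p.dtype for p in model.parameters()})                 # {torch.float32}
```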
+ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids + pad_token_id (`int`): The id of the `padding` token. + decoder_start_token_id (`int`): The id of the `start` token. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class ModulesToSaveWrapper(torch.nn.Module): + def __init__(self, module_to_save, adapter_name): + super().__init__() + self.original_module = module_to_save + self.modules_to_save = torch.nn.ModuleDict({}) + self._active_adapter = adapter_name + self._disable_adapters = False + self.update(adapter_name) + self.check_module() + + def check_module(self): + """Perform some sanity checks on the module to ensure that it works""" + # Try to anticipate some modules that users could try to target that would not work. + # Note: It's not possible to check hasattr(module, "forward"), since that returns True for ModuleDict and + # ModuleList, even though their forward methods cannot be called + forbidden_classes = (torch.nn.ModuleDict, torch.nn.ModuleList, torch.nn.ParameterDict, torch.nn.ParameterList) + if isinstance(self.original_module, forbidden_classes): + cls_name = self.original_module.__class__.__name__ + raise TypeError(f"modules_to_save cannot be applied to modules of type {cls_name}") + + @property + def disable_adapters(self) -> bool: + # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method + return self._disable_adapters + + @property + def active_adapter(self) -> str: + # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method + return self._active_adapter + + @property + def weight(self): + if self.active_adapter not in self.modules_to_save: + return self.original_module.weight + return self.modules_to_save[self.active_adapter].weight + + def update(self, adapter_name): + context_manager = nullcontext() + for _, param in self.original_module.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + import deepspeed + + context_manager = deepspeed.zero.GatheredParameters(self.original_module.parameters(), modifier_rank=0) + break + with context_manager: + self.modules_to_save.update(torch.nn.ModuleDict({adapter_name: copy.deepcopy(self.original_module)})) + + if hasattr(self.modules_to_save[adapter_name], "_hf_hook"): + old_hook = self.modules_to_save[adapter_name]._hf_hook + new_hook = self._create_new_hook(old_hook) + remove_hook_from_module(self.modules_to_save[adapter_name]) + add_hook_to_module(self.modules_to_save[adapter_name], new_hook) + + self.original_module.requires_grad_(False) + if adapter_name == self.active_adapter: + self.modules_to_save[adapter_name].requires_grad_(True) + + def _create_new_hook(self, old_hook): + r""" + Creates a new hook based on the old hook. Use it only if you know what you are doing ! 
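`shift_tokens_right` above builds decoder inputs by prepending the start token and replacing the `-100` label padding with `pad_token_id`. A tiny worked example with the same logic reproduced inline:

``` python
import torch

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    # same steps as the function above, reproduced for a standalone demo
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 1:] = input_ids[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    shifted.masked_fill_(shifted == -100, pad_token_id)
    return shifted

labels = torch.tensor([[15, 27, 42, -100, -100]])
print(shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=2))
# tensor([[ 2, 15, 27, 42,  0]])
```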
+ """ + old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__) + old_hook_attr = old_hook.__dict__ + filtered_old_hook_attr = {} + old_hook_init_signature = inspect.signature(old_hook_cls.__init__) + for k in old_hook_attr.keys(): + if k in old_hook_init_signature.parameters: + filtered_old_hook_attr[k] = old_hook_attr[k] + new_hook = old_hook_cls(**filtered_old_hook_attr) + return new_hook + + def forward(self, *args, **kwargs): + if self.disable_adapters or (self.active_adapter not in self.modules_to_save): + return self.original_module(*args, **kwargs) + return self.modules_to_save[self.active_adapter](*args, **kwargs) + + def enable_adapters(self, enabled: bool): + """Toggle the enabling and disabling of adapters + + Takes care of setting the requires_grad flag for the adapter weights. + + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if self._disable_adapters is not enabled: + # already in the desired state, do nothing + return + + if enabled: + self.original_module.requires_grad_(False) + self.modules_to_save[self.active_adapter].requires_grad_(True) + self._disable_adapters = False + else: + self.original_module.requires_grad_(True) + self.modules_to_save.requires_grad_(False) + self._disable_adapters = True + + def set_adapter(self, adapter_name: str): + """Set the active adapter + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (str): The name of the adapter to set as active + """ + if adapter_name not in self.modules_to_save: + raise ValueError(f"Adapter {adapter_name} not found in {self.modules_to_save.keys()}") + + self.modules_to_save[self.active_adapter].requires_grad_(False) + self.modules_to_save[adapter_name].requires_grad_(True) + self._active_adapter = adapter_name + + +def _get_submodules(model, key): + parent = model.get_submodule(".".join(key.split(".")[:-1])) + target_name = key.split(".")[-1] + target = model.get_submodule(key) + return parent, target, target_name + + +def _freeze_adapter(model, adapter_name): + for n, p in model.named_parameters(): + if adapter_name in n: + p.requires_grad = False + + +def _set_trainable(model, adapter_name): + key_list = [key for key, _ in model.named_modules()] + for key in key_list: + target_module_found = any(key.endswith(target_key) for target_key in model.modules_to_save) + if target_module_found: + parent, target, target_name = _get_submodules(model, key) + if isinstance(target, ModulesToSaveWrapper): + target.update(adapter_name) + target.set_adapter(target.active_adapter) + else: + new_module = ModulesToSaveWrapper(target, adapter_name) + new_module.set_adapter(adapter_name) + setattr(parent, target_name, new_module) + + +def _set_adapter(model, adapter_name): + def check_adapter_name(adapter_name): + if isinstance(adapter_name, str): + return adapter_name + + # adapter_name is a list of str + if len(adapter_name) > 1: + raise ValueError("Only one adapter can be set at a time for modules_to_save") + elif len(adapter_name) == 0: + raise ValueError("Please specify at least one adapter to set") + adapter_name = adapter_name[0] + return adapter_name + + for module in model.modules(): + if isinstance(module, ModulesToSaveWrapper): + # only check the adapter_name if we actually encounter a 
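`_get_submodules` above resolves a dotted key into (parent, target, attribute name), which is what `_set_trainable` uses to swap a module for its `ModulesToSaveWrapper`. A standalone demonstration of the same resolve-and-swap pattern on a toy model:

``` python
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))

def get_submodules(model, key):
    # mirrors _get_submodules above: resolve parent module, target module, and attribute name
    parent = model.get_submodule(".".join(key.split(".")[:-1]))
    target_name = key.split(".")[-1]
    target = model.get_submodule(key)
    return parent, target, target_name

parent, target, target_name = get_submodules(model, "2")
# swapping the resolved child in place is how wrapper modules get installed
setattr(parent, target_name, nn.Linear(8, 4))
print(model[2])  # Linear(in_features=8, out_features=4, bias=True)
```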
ModulesToSaveWrapper, otherwise we don't care + adapter_name = check_adapter_name(adapter_name) + module.set_adapter(adapter_name) + + +def _prepare_prompt_learning_config(peft_config, model_config): + if peft_config.num_layers is None: + if "num_hidden_layers" in model_config: + num_layers = model_config["num_hidden_layers"] + elif "num_layers" in model_config: + num_layers = model_config["num_layers"] + elif "n_layer" in model_config: + num_layers = model_config["n_layer"] + else: + raise ValueError("Please specify `num_layers` in `peft_config`") + peft_config.num_layers = num_layers + + if peft_config.token_dim is None: + if "hidden_size" in model_config: + token_dim = model_config["hidden_size"] + elif "n_embd" in model_config: + token_dim = model_config["n_embd"] + elif "d_model" in model_config: + token_dim = model_config["d_model"] + else: + raise ValueError("Please specify `token_dim` in `peft_config`") + peft_config.token_dim = token_dim + + if peft_config.num_attention_heads is None: + if "num_attention_heads" in model_config: + num_attention_heads = model_config["num_attention_heads"] + elif "n_head" in model_config: + num_attention_heads = model_config["n_head"] + elif "num_heads" in model_config: + num_attention_heads = model_config["num_heads"] + elif "encoder_attention_heads" in model_config: + num_attention_heads = model_config["encoder_attention_heads"] + else: + raise ValueError("Please specify `num_attention_heads` in `peft_config`") + peft_config.num_attention_heads = num_attention_heads + + if getattr(peft_config, "encoder_hidden_size", None) is None: + setattr(peft_config, "encoder_hidden_size", peft_config.token_dim) + + return peft_config + + +def fsdp_auto_wrap_policy(model): + import functools + import os + + from accelerate import FullyShardedDataParallelPlugin + from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy + + from ..tuners import PrefixEncoder, PromptEmbedding, PromptEncoder + + default_transformer_cls_names_to_wrap = ( + ",".join(model._no_split_modules) if getattr(model, "_no_split_modules", None) is not None else "" + ) + transformer_cls_names_to_wrap = os.environ.get( + "FSDP_TRANSFORMER_CLS_TO_WRAP", default_transformer_cls_names_to_wrap + ).split(",") + transformer_cls_to_wrap = {PrefixEncoder, PromptEncoder, PromptEmbedding} + for layer_class in transformer_cls_names_to_wrap: + transformer_cls = FullyShardedDataParallelPlugin.get_module_class_from_name(model, layer_class) + if transformer_cls is None: + raise Exception("Could not find the transformer layer class to wrap in the model.") + else: + transformer_cls_to_wrap.add(transformer_cls) + + def lambda_policy_fn(module): + if ( + len(list(module.named_children())) == 0 + and getattr(module, "weight", None) is not None + and module.weight.requires_grad + ): + return True + return False + + lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) + transformer_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls=transformer_cls_to_wrap, + ) + + auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy]) + return auto_wrap_policy + + +def transpose(weight, fan_in_fan_out): + if not fan_in_fan_out: + return weight + + if isinstance(weight, torch.nn.Parameter): + return torch.nn.Parameter(weight.T) + return weight.T + + +def _is_valid_match(key: str, target_key: str): + """ + Helper function to match module names target_key and key. 
Makes sure that either the key is exactly the target_key + or the target_key is a submodule of key + """ + if key.endswith(target_key): + if len(key) > len(target_key): + return key.endswith("." + target_key) # must be a sub module + return True + return False + + +def _get_batch_size(input_ids: Optional[torch.Tensor], inputs_embeds: Optional[torch.Tensor]) -> int: + """Get the batch size based on either input_ids or input_embeds + + Raises an ValueError if both are None. + + """ + if (input_ids is None) and (inputs_embeds is None): + raise ValueError("You have to provide either input_ids or inputs_embeds") + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + return batch_size + + +def get_quantization_config(model: torch.nn.Module, method: str): + """ + Get the quantization config of the related quantization method + """ + if ( + hasattr(model, "config") + and hasattr(model.config, "quantization_config") + and (getattr(model, "quantization_method", None) == method) + ): + return model.config.quantization_config + return None + + +def get_auto_gptq_quant_linear(gptq_quantization_config): + """ + Get the right AutoGPTQQuantLinear class based on the quantization config file + """ + if gptq_quantization_config is not None and is_auto_gptq_available(): + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + + desc_act = gptq_quantization_config.desc_act + group_size = gptq_quantization_config.group_size + bits = gptq_quantization_config.bits + if hasattr(gptq_quantization_config, "use_exllama"): + use_exllama = gptq_quantization_config.use_exllama + else: + use_exllama = not gptq_quantization_config.disable_exllama + if hasattr(gptq_quantization_config, "exllama_config"): + exllama_version = gptq_quantization_config.exllama_config["version"] + else: + exllama_version = 1 + AutoGPTQQuantLinear = dynamically_import_QuantLinear( + use_triton=False, + desc_act=desc_act, + group_size=group_size, + bits=bits, + disable_exllama=not (use_exllama and exllama_version == 1), + disable_exllamav2=not (use_exllama and exllama_version == 2), + ) + return AutoGPTQQuantLinear + return None + + +def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: + """ + Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For + example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is + guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with + non-overlapping lifetimes may have the same id. + + This method is the exact same copy of + https://github.com/huggingface/transformers/blob/main/src/transformers/pytorch_utils.py#L282C1-L300C58 but we added + it here manually to avoid import issue with old versions of transformers. + """ + if tensor.device.type == "xla" and is_torch_tpu_available(): + # NOTE: xla tensors dont have storage + # use some other unique id to distinguish. + # this is a XLA tensor, it must be created using torch_xla's + # device. So the following import is safe: + import torch_xla + + unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) + else: + unique_id = storage_ptr(tensor) + + return tensor.device, unique_id, storage_size(tensor) + + +def cast_mixed_precision_params(model, dtype): + """ + Cast all non-trainable parameters of the model to the given `dtype`. 
The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. The trainable parameters are cast to full + precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for + non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using + automatic mixed-precision training. + + Args: + model (`torch.nn.Module`): + The model to cast the non-trainable parameters of. + dtype (`torch.dtype`): + The dtype to cast the non-trainable parameters to. The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. + """ + for p in model.parameters(): + if not p.requires_grad: + p.data = p.to(dtype) + else: + p.data = p.to(torch.float32) + + +def str_to_bool(value: str) -> int: + """ + Converts a string representation of truth to `True` (1) or `False` (0). + + True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; + """ + # same as function as in accelerate.utils, which replaces the deprecated distutils.util.strtobool + value = value.lower() + if value in ("y", "yes", "t", "true", "on", "1"): + return 1 + elif value in ("n", "no", "f", "false", "off", "0"): + return 0 + else: + raise ValueError(f"invalid truth value {value}") + + +def check_file_exists_on_hf_hub(repo_id: str, filename: str, **kwargs) -> Optional[bool]: + """Check if a file exists on HF Hub, if check was not successful returns None instead of erroring. + + Respect offline mode if set. + + """ + exists: Optional[bool] = None + if str_to_bool(os.environ.get("HF_HUB_OFFLINE", "0")): + # user set offline mode, cannot check + return exists + + try: + exists = file_exists(repo_id, filename, **kwargs) + except (HFValidationError, EntryNotFoundError): + # error, exists stays None + pass + except Exception as e: + warnings.warn( + f"Unable to fetch remote file due to the following error {e} - silently ignoring the lookup" + f" for the file {filename} in {repo_id}." + ) + + return exists diff --git a/MoRA/peft_mora/utils/peft_types.py b/MoRA/peft_mora/utils/peft_types.py new file mode 100644 index 0000000000000000000000000000000000000000..d4a84435dc60bae6fde33bdbbc5e24f03293c678 --- /dev/null +++ b/MoRA/peft_mora/utils/peft_types.py @@ -0,0 +1,73 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import enum + + +class PeftType(str, enum.Enum): + """ + Enum class for the different types of adapters in PEFT. 
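`cast_mixed_precision_params` above keeps trainable parameters in fp32 and moves frozen ones to the half-precision dtype. A toy demonstration, treating the first layer as the frozen base and the second as the trainable adapter:

``` python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))
model[0].requires_grad_(False)   # pretend layer 0 is the frozen base model

for p in model.parameters():
    # frozen parameters go to half precision, trainable ones stay in fp32
    p.data = p.to(torch.bfloat16) if not p.requires_grad else p.to(torch.float32)

print({name: p.dtype for name, p in model.named_parameters()})
# {'0.weight': torch.bfloat16, '0.bias': torch.bfloat16,
#  '1.weight': torch.float32, '1.bias': torch.float32}
```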
+ + Supported PEFT types: + - PROMPT_TUNING + - MULTITASK_PROMPT_TUNING + - P_TUNING + - PREFIX_TUNING + - LORA + - ADALORA + - ADAPTION_PROMPT + - IA3 + - LOHA + - LOKR + - OFT + """ + + PROMPT_TUNING = "PROMPT_TUNING" + MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING" + P_TUNING = "P_TUNING" + PREFIX_TUNING = "PREFIX_TUNING" + LORA = "LORA" + ADALORA = "ADALORA" + ADAPTION_PROMPT = "ADAPTION_PROMPT" + IA3 = "IA3" + LOHA = "LOHA" + LOKR = "LOKR" + OFT = "OFT" + POLY = "POLY" + + +class TaskType(str, enum.Enum): + """ + Enum class for the different types of tasks supported by PEFT. + + Overview of the supported task types: + - SEQ_CLS: Text classification. + - SEQ_2_SEQ_LM: Sequence-to-sequence language modeling. + - CAUSAL_LM: Causal language modeling. + - TOKEN_CLS: Token classification. + - QUESTION_ANS: Question answering. + - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as embeddings or features + for downstream tasks. + """ + + SEQ_CLS = "SEQ_CLS" + SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM" + CAUSAL_LM = "CAUSAL_LM" + TOKEN_CLS = "TOKEN_CLS" + QUESTION_ANS = "QUESTION_ANS" + FEATURE_EXTRACTION = "FEATURE_EXTRACTION" diff --git a/MoRA/peft_mora/utils/save_and_load.py b/MoRA/peft_mora/utils/save_and_load.py new file mode 100644 index 0000000000000000000000000000000000000000..5ac1264d89afb82d3ab3eadccb49163a75284aaa --- /dev/null +++ b/MoRA/peft_mora/utils/save_and_load.py @@ -0,0 +1,330 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import warnings +from typing import Optional + +import torch +from huggingface_hub import file_exists, hf_hub_download +from huggingface_hub.utils import EntryNotFoundError +from safetensors.torch import load_file as safe_load_file + +from .other import ( + EMBEDDING_LAYER_NAMES, + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, + check_file_exists_on_hf_hub, + infer_device, +) +from .peft_types import PeftType + + +def has_valid_embedding_base_layer(layer): + """Check if the layer has an embedding base layer""" + return hasattr(layer, "base_layer") and isinstance(layer.base_layer, (torch.nn.Linear, torch.nn.Embedding)) + + +def get_embedding_layer_name(model, layer, is_embedding_in_target_modules): + """Get the name of the embedding module for a given layer.""" + for name, module in model.named_modules(): + if (not is_embedding_in_target_modules and module == layer) or module == getattr(layer, "base_layer", None): + return name + return None + + +def get_peft_model_state_dict( + model, state_dict=None, adapter_name="default", unwrap_compiled=False, save_embedding_layers="auto" +): + """ + Get the state dict of the Peft model. + + Args: + model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP, + the model should be the underlying model/unwrapped model (i.e. model.module). + state_dict (`dict`, *optional*, defaults to `None`): + The state dict of the model. If not provided, the state dict of the passed model will be used. 
+ adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter whose state dict should be returned. + unwrap_compiled (`bool`, *optional*, defaults to `False`): + Whether to unwrap the model if torch.compile was used. + save_embedding_layers (`Union[bool, str]`, , *optional*, defaults to `auto`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common embedding + layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. Based on it + sets the boolean flag. This only works for πŸ€— transformers models. + """ + if unwrap_compiled: + model = getattr(model, "_orig_mod", model) + + config = model.peft_config[adapter_name] + if state_dict is None: + state_dict = model.state_dict() + if config.peft_type in (PeftType.LORA, PeftType.ADALORA): + # to_return = lora_state_dict(model, bias=model.peft_config.bias) + # adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py` + # to be used directly with the state dict which is necessary when using DeepSpeed or FSDP + bias = config.bias + if bias == "none": + to_return = {k: state_dict[k] for k in state_dict if "lora_" in k} + elif bias == "all": + to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + for k in state_dict: + if "lora_" in k: + to_return[k] = state_dict[k] + bias_name = k.split("lora_")[0] + "bias" + if bias_name in state_dict: + to_return[bias_name] = state_dict[bias_name] + else: + raise NotImplementedError + to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))} + if config.peft_type == PeftType.ADALORA: + rank_pattern = config.rank_pattern + if rank_pattern is not None: + rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()} + config.rank_pattern = rank_pattern + to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name) + + elif config.peft_type == PeftType.LOHA: + to_return = {k: state_dict[k] for k in state_dict if "hada_" in k} + + elif config.peft_type == PeftType.LOKR: + to_return = {k: state_dict[k] for k in state_dict if "lokr_" in k} + + elif config.peft_type == PeftType.ADAPTION_PROMPT: + to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")} + elif config.is_prompt_learning: + to_return = {} + if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + to_return["prefix_task_cols"] = model.prompt_encoder[adapter_name].prefix_task_cols + to_return["prefix_task_rows"] = model.prompt_encoder[adapter_name].prefix_task_rows + prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight + else: + if config.inference_mode: + prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight + else: + prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name) + to_return["prompt_embeddings"] = prompt_embeddings + elif config.peft_type == PeftType.IA3: + to_return = {k: state_dict[k] for k in state_dict if "ia3_" in k} + elif config.peft_type == PeftType.OFT: + to_return = {k: state_dict[k] for k in state_dict if "oft_" in k} + elif config.peft_type == PeftType.POLY: + to_return = {k: state_dict[k] for k in state_dict if "poly_" in k} + else: + raise NotImplementedError + if getattr(model, "modules_to_save", None) is not None: + for key, value in state_dict.items(): + if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in 
model.modules_to_save): + to_return[key.replace("modules_to_save.", "")] = value + + # check the common embedding layers in `target_modules` to reset `save_embedding_layers` if necessary + is_embedding_in_target_modules = False + if ( + save_embedding_layers == "auto" + and hasattr(config, "target_modules") + and any(k in config.target_modules for k in EMBEDDING_LAYER_NAMES) + ): + warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.") + save_embedding_layers = is_embedding_in_target_modules = True + elif save_embedding_layers == "auto": + vocab_size = getattr(getattr(model, "config", None), "vocab_size", None) + model_id = getattr(config, "base_model_name_or_path", None) + + # For some models e.g. diffusers the text config file is stored in a subfolder + # we need to make sure we can download that config. + has_remote_config = False + + # ensure that this check is not performed in HF offline mode, see #1452 + if model_id is not None: + exists = check_file_exists_on_hf_hub(model_id, "config.json") + if exists is None: + # check failed, could not determine if it exists or not + warnings.warn( + f"Could not find a config file in {model_id} - will assume that the vocabulary was not modified." + ) + has_remote_config = False + else: + has_remote_config = exists + + # check if the vocab size of the base model is different from the vocab size of the finetuned model + if ( + vocab_size + and model_id + and has_remote_config + and (vocab_size != model.config.__class__.from_pretrained(model_id).vocab_size) + ): + warnings.warn( + "Setting `save_embedding_layers` to `True` as the embedding layer has been resized during finetuning." + ) + save_embedding_layers = True + else: + save_embedding_layers = False + + if save_embedding_layers and hasattr(model, "get_input_embeddings"): + for layer in [model.get_input_embeddings(), model.get_output_embeddings()]: + if not is_embedding_in_target_modules or has_valid_embedding_base_layer(layer): + # support from version >= 0.6.2 + embedding_module_name = get_embedding_layer_name(model, layer, is_embedding_in_target_modules) + if embedding_module_name: + to_return.update({k: v for k, v in state_dict.items() if embedding_module_name in k}) + elif save_embedding_layers: + warnings.warn("Could not identify embedding layer(s) because the model is not a πŸ€— transformers model.") + + to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()} + return to_return + + +def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default"): + """ + Set the state dict of the Peft model. + + Args: + model ([`PeftModel`]): The Peft model. + peft_model_state_dict (`dict`): The state dict of the Peft model. 
+ """ + config = model.peft_config[adapter_name] + state_dict = {} + if getattr(model, "modules_to_save", None) is not None: + for key, value in peft_model_state_dict.items(): + if any(module_name in key for module_name in model.modules_to_save): + for module_name in model.modules_to_save: + if module_name in key: + key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}") + break + state_dict[key] = value + else: + state_dict = peft_model_state_dict + + if config.peft_type in ( + PeftType.LORA, + PeftType.LOHA, + PeftType.LOKR, + PeftType.ADALORA, + PeftType.IA3, + PeftType.OFT, + PeftType.POLY, + ): + peft_model_state_dict = {} + parameter_prefix = { + PeftType.IA3: "ia3_", + PeftType.LORA: "lora_", + PeftType.ADALORA: "lora_", + PeftType.LOHA: "hada_", + PeftType.LOKR: "lokr_", + PeftType.OFT: "oft_", + PeftType.POLY: "poly_", + }[config.peft_type] + for k, v in state_dict.items(): + if parameter_prefix in k: + suffix = k.split(parameter_prefix)[1] + if "." in suffix: + suffix_to_replace = ".".join(suffix.split(".")[1:]) + k = k.replace(suffix_to_replace, f"{adapter_name}.{suffix_to_replace}") + else: + k = f"{k}.{adapter_name}" + peft_model_state_dict[k] = v + else: + peft_model_state_dict[k] = v + if config.peft_type == PeftType.ADALORA: + rank_pattern = config.rank_pattern + if rank_pattern is not None: + model.resize_modules_by_rank_pattern(rank_pattern, adapter_name) + elif config.is_prompt_learning or config.peft_type == PeftType.ADAPTION_PROMPT: + peft_model_state_dict = state_dict + else: + raise NotImplementedError + + load_result = model.load_state_dict(peft_model_state_dict, strict=False) + if config.is_prompt_learning: + model.prompt_encoder[adapter_name].embedding.load_state_dict( + {"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True + ) + + if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + model.prompt_encoder[adapter_name].load_state_dict(peft_model_state_dict, strict=False) + return load_result + + +def load_peft_weights(model_id: str, device: Optional[str] = None, **hf_hub_download_kwargs) -> dict: + r""" + A helper method to load the PEFT weights from the HuggingFace Hub or locally + + Args: + model_id (`str`): + The local path to the adapter weights or the name of the adapter to load from the HuggingFace Hub. + device (`str`): + The device to load the weights onto. + hf_hub_download_kwargs (`dict`): + Additional arguments to pass to the `hf_hub_download` method when loading from the HuggingFace Hub. 
+ """ + path = ( + os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) + if hf_hub_download_kwargs.get("subfolder", None) is not None + else model_id + ) + + if device is None: + device = infer_device() + + if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)): + filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME) + use_safetensors = True + elif os.path.exists(os.path.join(path, WEIGHTS_NAME)): + filename = os.path.join(path, WEIGHTS_NAME) + use_safetensors = False + else: + token = hf_hub_download_kwargs.get("token", None) + if token is None: + token = hf_hub_download_kwargs.get("use_auth_token", None) + + hub_filename = ( + os.path.join(hf_hub_download_kwargs["subfolder"], SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder", None) is not None + else SAFETENSORS_WEIGHTS_NAME + ) + has_remote_safetensors_file = file_exists( + repo_id=model_id, + filename=hub_filename, + revision=hf_hub_download_kwargs.get("revision", None), + repo_type=hf_hub_download_kwargs.get("repo_type", None), + token=token, + ) + use_safetensors = has_remote_safetensors_file + + if has_remote_safetensors_file: + # Priority 1: load safetensors weights + filename = hf_hub_download( + model_id, + SAFETENSORS_WEIGHTS_NAME, + **hf_hub_download_kwargs, + ) + else: + try: + filename = hf_hub_download(model_id, WEIGHTS_NAME, **hf_hub_download_kwargs) + except EntryNotFoundError: + raise ValueError( + f"Can't find weights for {model_id} in {model_id} or in the Hugging Face Hub. " + f"Please check that the file {WEIGHTS_NAME} or {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}." + ) + + if use_safetensors: + if hasattr(torch.backends, "mps") and (device == torch.device("mps")): + adapters_weights = safe_load_file(filename, device="cpu") + else: + adapters_weights = safe_load_file(filename, device=device) + else: + adapters_weights = torch.load(filename, map_location=torch.device(device)) + + return adapters_weights diff --git a/MoRA/requirements.txt b/MoRA/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..efecf06be3ec2d075848d78c76ca12e1ff99f8d4 --- /dev/null +++ b/MoRA/requirements.txt @@ -0,0 +1,7 @@ +transformers==4.37.2 +bitsandbytes==0.39.0 +deepspeed==0.11.1 +accelerate @ git+https://github.com/huggingface/accelerate.git@bd72a5f1a80d5146554458823f8aeda0a9db5297 +fire==0.6.0 +tqdm==4.62.3 +torch==2.1.1+cu121 diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6ced7caa793eae17b5794e39e60064a550a05aab --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +--- +base_model: Gunulhona/Gemma-System-9B +library_name: peft +--- + +# Gemma-System-9B with MoRA + SimPO + + +This is a SimPO finetuned version of Gemma-System-9B using MoRA (Mixture of Rank Adaptation) for preference alignment. The model is trained to better align with human preferences through direct preference optimization. + +## Model Details + +### Model Description + + +This model is a finetuned version of Gemma-System-9B using SimPO (Simple Preference Optimization) training method. The model uses MoRA adaptation with rank 256 to efficiently finetune the base model while maintaining its core capabilities. 
+ +- **Developed by:** Gunulhona (finetuned from a merged Gemma-2-9B-it base) +- **Model type:** Causal Language Model with MoRA adaptation +- **Language(s):** Primarily English and Korean +- **License:** Same as base model (Gemma-System-9B) +- **Finetuned from model:** Gunulhona/Gemma-System-9B + +## Training Details + +### Training Procedure + +#### Training Hyperparameters + +- **Training regime:** bfloat16 mixed precision +- **Learning rate:** 5e-7 +- **Batch size per device:** 1 +- **Gradient accumulation steps:** 16 +- **Total batch size:** 16 +- **Number of epochs:** 200 +- **Optimizer:** AdamW with a cosine-with-restarts scheduler +- **Loss type:** SimPO (configurable) +- **Beta (SimPO):** 10.0 +- **SimPO gamma:** 0.5 +- **Maximum sequence length:** 65,536 tokens + +#### MoRA Configuration +- **Rank (r):** 256 +- **Alpha:** 16 +- **Dropout:** 0.05 +- **MoRA type:** 6 +- **Target modules:** + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - down_proj + - up_proj + +### Training Data + +The model was trained on the "Gunulhona/open_dpo_merged" dataset, which contains pairs of preferred and non-preferred responses for preference learning. + +## Technical Specifications + +### Model Architecture and Objective + +The model uses MoRA to apply high-rank, parameter-efficient updates to the base model. Training supports either a DPO or a SimPO objective; this checkpoint was trained with: + +- **SimPO:** Simple Preference Optimization with β=10.0 and γ=0.5 + +### Compute Infrastructure + +#### Hardware +- Training performed on CUDA-capable GPUs +- Uses DeepSpeed for distributed training +- Gradient checkpointing enabled for memory efficiency + +#### Software +- PEFT library for parameter-efficient finetuning +- Transformers library +- DeepSpeed for training optimization +- Weights & Biases for experiment tracking + +## Environmental Impact + +- **Hardware Type:** NVIDIA GPUs +- **Training Regime:** Mixed BF16 precision +- **Optimization:** DeepSpeed + Gradient Checkpointing + +## Model Card Contact + +For questions about this model, please contact Gunulhona.
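The β=10.0 and γ=0.5 values above correspond to the standard SimPO objective, which scores each response by its length-normalized log-likelihood and asks the preferred response to beat the rejected one by a margin γ. As a clarifying sketch (stated from the SimPO formulation, not taken from this repo's training code), the per-pair loss is:

$$\mathcal{L}_{\mathrm{SimPO}}(x, y_w, y_l) = -\log\,\sigma\!\Big(\tfrac{\beta}{|y_w|}\log \pi_\theta(y_w \mid x) - \tfrac{\beta}{|y_l|}\log \pi_\theta(y_l \mid x) - \gamma\Big)$$

For inference, the following is a minimal, hypothetical loading sketch rather than the authors' script. It assumes the repo's bundled PEFT fork (`peft_mora`, based on PEFT 0.9.0) is installed, since stock PEFT does not recognize the `use_mora`/`mora_type` fields in `adapter_config.json`; `ADAPTER_PATH` is a placeholder for a local checkout of this repository.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the repo's PEFT fork (peft_mora) is installed and exposes the usual PeftModel API.
from peft_mora import PeftModel

BASE_ID = "Gunulhona/Gemma-System-9B"   # base model named in adapter_config.json
ADAPTER_PATH = "."                      # hypothetical: local checkout of this adapter repo

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)  # tokenizer files ship with the adapter
model = AutoModelForCausalLM.from_pretrained(
    BASE_ID, torch_dtype=torch.bfloat16, device_map="auto"
)

# Attach the MoRA adapter (r=256, mora_type=6) stored in adapter_model.safetensors.
model = PeftModel.from_pretrained(model, ADAPTER_PATH)

# Optionally fold the high-rank update back into the base weights, as with LoRA.
model = model.merge_and_unload()
```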
+ +### Framework versions + +- [PEFT 0.9.0](https://github.com/kongds/MoRA) diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..de716385a591ce0998c73380ef6942cb51953a99 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "AutoConfig": "MoRA.config.MoRAConfig", + "AutoModel": "MoRA.model.MoRAModel", + "AutoModelForCausalLM": "MoRA.model.MoRAModelForCausalLM" + }, + "base_model_name_or_path": "Gunulhona/Gemma-System-9B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "mora_type": 6, + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "k_proj", + "up_proj", + "o_proj", + "gate_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_mora": true, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ba1e99ebec7e911cd328a3fe8805d840ab1546f --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f88bb8a58b8585a6eb9bc6ee5b1e31ae56c4cb87266681b6049daa50e13545 +size 6913516904 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..8d6368f7e735fbe4781bf6e956b7c6ad0586df80 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,34 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..45db2b9123c19dd338c31a1403c77d3acb98b846 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if 
message['role'] not in ['user', 'assistant', 'system'] or (loop.index0 > 0 and message['role'] != 'system' and message['role'] == messages[loop.index0 - 1]['role']) %}{{ raise_exception('Invalid role or role sequence') }}{% endif %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn>\n' if message['role'] != 'system' else message['content'] + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<eos>", + "model_max_length": 2048, + "pad_token": "<pad>", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false +}
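The `chat_template` in `tokenizer_config.json` implements the Gemma turn format, extended so that a leading system message is passed through verbatim instead of being wrapped in turn markers. As an illustration (not part of the repository files), a conversation could be rendered with `transformers` like this; the local path is a hypothetical placeholder for a checkout containing the tokenizer files added above:

```python
from transformers import AutoTokenizer

# Hypothetical path: a local checkout containing tokenizer_config.json / tokenizer.model.
tokenizer = AutoTokenizer.from_pretrained(".")

messages = [
    {"role": "system", "content": "You are a concise, helpful assistant."},
    {"role": "user", "content": "Summarize what a MoRA adapter is in one sentence."},
]

# Produces <bos>, the system text verbatim, the user turn wrapped in
# <start_of_turn>user ... <end_of_turn>, and a trailing <start_of_turn>model line
# because add_generation_prompt=True; invalid or repeated consecutive roles raise an exception.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```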