diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..52373fe24473b1aa44333d318f578ae6bf04b49b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/MoRA/README.md b/MoRA/README.md new file mode 100644 index 0000000000000000000000000000000000000000..304b0570eee175015a38d05a10f251068b034154 --- /dev/null +++ b/MoRA/README.md @@ -0,0 +1,68 @@ +# [MoRA: High-Rank Updating for Parameter-Efficient Fine-Tuning](https://arxiv.org/abs/2405.12130) + +## Setup + +We implement MoRA in peft-mora, based on HF peft, in the [`apply_mora`](https://github.com/kongds/MoRA/blob/main/peft-mora/src/peft/tuners/lora/layer.py#L229) and [`get_delta_weight`](https://github.com/kongds/MoRA/blob/main/peft-mora/src/peft/tuners/lora/layer.py#L514) methods. +``` sh +pip install -e ./peft-mora +``` + +After installation, it can be used as follows: + +``` python +from peft import LoraConfig, get_peft_model +config = LoraConfig( + # enable MoRA + use_mora=True, + # type 1 (Sharing) for large lora ranks, Eq. 6 in paper + # type 6 (RoPE based) for small lora ranks, Eq. 9 in paper + mora_type=6, + # lora rank here, we will calculate the corresponding $\hat{r}$ in MoRA + r=lora_r, + # MoRA does not use lora_alpha + # lora_alpha=lora_alpha, + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + task_type="CAUSAL_LM", + **kwargs, +) +model = get_peft_model(model, config) + +# training here... + +# can be merged into the model via `merge_and_unload`, like LoRA +model = model.merge_and_unload() +``` + +## Examples +### Fine-tuning MetaMath with MoRA + +``` sh +RANK=8 +deepspeed --num_gpus=8 --num_nodes=2 train.py \ + --base_model --micro_batch_size 4\ + --wandb_run_name mora_math_r8 --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,down_proj,up_proj \ + --num_epochs 3 --deepspeed ds.config --wandb_project lora-math --lora_r $RANK --batch_size 128 \ + --data_path meta-math/MetaMath \ + --save_steps 3000 \ + --learning_rate 3e-4 --mora_type 6 \ + --logging_steps 5 --use_bf16 --use_16bit --use_mora +``` + +### Pretraining + +``` sh +deepspeed --num_gpus=8 --num_nodes=4 train.py \ + --micro_batch_size 16 --wandb_run_name mora-pretrain250m-r128 \ + --num_epochs 1 --wandb_project lora-pretrain --batch_size 1024 \ + --data_path --logging_steps 1 \ + --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,down_proj,up_proj \ + --lora_r 128 --lora_alpha 64 --warmup_steps 1000 \ + --force_tqdm_update --lr_scheduler_type cosine \ + --max_steps 10000 --pretrain 250m \ + --train_embhead --learning_rate 5e-4 \ + --use_mora --use_relora --use_relora_step 2000 # ReMoRA: merge every 2000 steps +``` + +## Acknowledgement +Our code is based on peft, alpaca-lora, and ReLoRA diff --git a/MoRA/config.py b/MoRA/config.py new file mode 100644 index 0000000000000000000000000000000000000000..c08d8cc8d2f3eb2034e42e044cb82c5b163d8c8f --- /dev/null +++ b/MoRA/config.py @@ -0,0 +1,4 @@ +from peft_mora import LoraConfig + +class MoRAModelForCausalLM(LoraConfig): + pass diff --git a/MoRA/model.py b/MoRA/model.py new file mode 100644 index 0000000000000000000000000000000000000000..40c9e946d28be7517eb5fb6f20a0a795f4af0d32 --- /dev/null +++ b/MoRA/model.py @@ -0,0 +1,4 @@ +from peft_mora import PeftModelForCausalLM + +class MoRAModelForCausalLM(PeftModelForCausalLM): +
pass diff --git a/MoRA/peft_mora/__init__.py b/MoRA/peft_mora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a197236800e64ebe412319e0b490e29f0136ba16 --- /dev/null +++ b/MoRA/peft_mora/__init__.py @@ -0,0 +1,90 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.9.0" + +from .auto import ( + AutoPeftModel, + AutoPeftModelForCausalLM, + AutoPeftModelForSequenceClassification, + AutoPeftModelForSeq2SeqLM, + AutoPeftModelForTokenClassification, + AutoPeftModelForQuestionAnswering, + AutoPeftModelForFeatureExtraction, +) +from .mapping import ( + MODEL_TYPE_TO_PEFT_MODEL_MAPPING, + PEFT_TYPE_TO_CONFIG_MAPPING, + get_peft_config, + get_peft_model, + inject_adapter_in_model, +) +from .mixed_model import PeftMixedModel +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, + PeftModelForQuestionAnswering, + PeftModelForFeatureExtraction, +) +from .tuners import ( + AdaptionPromptConfig, + AdaptionPromptModel, + LoraConfig, + LoftQConfig, + LoraModel, + LoHaConfig, + LoHaModel, + LoKrConfig, + LoKrModel, + IA3Config, + IA3Model, + AdaLoraConfig, + AdaLoraModel, + PrefixEncoder, + PrefixTuningConfig, + PromptEmbedding, + PromptEncoder, + PromptEncoderConfig, + PromptEncoderReparameterizationType, + PromptTuningConfig, + PromptTuningInit, + MultitaskPromptTuningConfig, + MultitaskPromptTuningInit, + OFTConfig, + OFTModel, + PolyConfig, + PolyModel, +) +from .utils import ( + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + PeftType, + TaskType, + bloom_model_postprocess_past_key_value, + get_peft_model_state_dict, + prepare_model_for_int8_training, + prepare_model_for_kbit_training, + set_peft_model_state_dict, + shift_tokens_right, + load_peft_weights, + cast_mixed_precision_params, +) +from .config import PeftConfig, PromptLearningConfig diff --git a/MoRA/peft_mora/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f6f661d01e55501c00f3a27413836720bd25ae04 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/auto.cpython-312.pyc b/MoRA/peft_mora/__pycache__/auto.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05dd40265a4c1a6a9ef5a8728b99513281f9f7f8 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/auto.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..017717abf1965828f86eda40cb8743bd6856058d Binary files /dev/null and 
b/MoRA/peft_mora/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/import_utils.cpython-312.pyc b/MoRA/peft_mora/__pycache__/import_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d46c04d6e509a9e1879bbbf2f6c4dcf1a7dbd7f9 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/import_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/mapping.cpython-312.pyc b/MoRA/peft_mora/__pycache__/mapping.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..822bb5fd4572d22e2cff40c2cfdd4a89afb83f7b Binary files /dev/null and b/MoRA/peft_mora/__pycache__/mapping.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/mixed_model.cpython-312.pyc b/MoRA/peft_mora/__pycache__/mixed_model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ca9182613197a95d93f92edc8d6215ff49cd592 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/mixed_model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/__pycache__/peft_model.cpython-312.pyc b/MoRA/peft_mora/__pycache__/peft_model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ae7ede202c501ebbb6baa3ba323770186865f9c4 Binary files /dev/null and b/MoRA/peft_mora/__pycache__/peft_model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/auto.py b/MoRA/peft_mora/auto.py new file mode 100644 index 0000000000000000000000000000000000000000..353c9e2f84c48bd61194102da6d6e83dfdcd42db --- /dev/null +++ b/MoRA/peft_mora/auto.py @@ -0,0 +1,170 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import importlib +import os +from typing import Optional + +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoModelForQuestionAnswering, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForTokenClassification, + AutoTokenizer, +) + +from .config import PeftConfig +from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForFeatureExtraction, + PeftModelForQuestionAnswering, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, +) +from .utils.constants import TOKENIZER_CONFIG_NAME +from .utils.other import check_file_exists_on_hf_hub + + +class _BaseAutoPeftModel: + _target_class = None + _target_peft_class = None + + def __init__(self, *args, **kwargs): + # For consistency with transformers: https://github.com/huggingface/transformers/blob/91d7df58b6537d385e90578dac40204cb550f706/src/transformers/models/auto/auto_factory.py#L400 + raise EnvironmentError( # noqa: UP024 + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_config(config)` methods." 
+ ) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs, + ): + r""" + A wrapper around all the preprocessing steps a user needs to perform in order to load a PEFT model. The kwargs + are passed along to `PeftConfig` that automatically takes care of filtering the kwargs of the Hub methods and + the config object init. + """ + peft_config = PeftConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + base_model_path = peft_config.base_model_name_or_path + + task_type = getattr(peft_config, "task_type", None) + + if cls._target_class is not None: + target_class = cls._target_class + elif cls._target_class is None and task_type is not None: + # this is only in the case where we use `AutoPeftModel` + raise ValueError( + "Cannot use `AutoPeftModel` with a task type, please use a specific class for your task type. (e.g. `AutoPeftModelForCausalLM` for `task_type='CAUSAL_LM'`)" + ) + + if task_type is not None: + expected_target_class = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[task_type] + if cls._target_peft_class.__name__ != expected_target_class.__name__: + raise ValueError( + f"Expected target PEFT class: {expected_target_class.__name__}, but you have asked for: {cls._target_peft_class.__name__ }" + " make sure that you are loading the correct model for your task type." + ) + elif task_type is None and getattr(peft_config, "auto_mapping", None) is not None: + auto_mapping = getattr(peft_config, "auto_mapping", None) + base_model_class = auto_mapping["base_model_class"] + parent_library_name = auto_mapping["parent_library"] + + parent_library = importlib.import_module(parent_library_name) + target_class = getattr(parent_library, base_model_class) + else: + raise ValueError( + "Cannot infer the auto class from the config, please make sure that you are loading the correct model for your task type." 
+ ) + + base_model = target_class.from_pretrained(base_model_path, **kwargs) + + tokenizer_exists = False + if os.path.exists(os.path.join(pretrained_model_name_or_path, TOKENIZER_CONFIG_NAME)): + tokenizer_exists = True + else: + token = kwargs.get("token", None) + if token is None: + token = kwargs.get("use_auth_token", None) + + tokenizer_exists = check_file_exists_on_hf_hub( + repo_id=pretrained_model_name_or_path, + filename=TOKENIZER_CONFIG_NAME, + revision=kwargs.get("revision", None), + repo_type=kwargs.get("repo_type", None), + token=token, + ) + + if tokenizer_exists: + tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=kwargs.get("trust_remote_code", False) + ) + base_model.resize_token_embeddings(len(tokenizer)) + + return cls._target_peft_class.from_pretrained( + base_model, + pretrained_model_name_or_path, + adapter_name=adapter_name, + is_trainable=is_trainable, + config=config, + **kwargs, + ) + + +class AutoPeftModel(_BaseAutoPeftModel): + _target_class = None + _target_peft_class = PeftModel + + +class AutoPeftModelForCausalLM(_BaseAutoPeftModel): + _target_class = AutoModelForCausalLM + _target_peft_class = PeftModelForCausalLM + + +class AutoPeftModelForSeq2SeqLM(_BaseAutoPeftModel): + _target_class = AutoModelForSeq2SeqLM + _target_peft_class = PeftModelForSeq2SeqLM + + +class AutoPeftModelForSequenceClassification(_BaseAutoPeftModel): + _target_class = AutoModelForSequenceClassification + _target_peft_class = PeftModelForSequenceClassification + + +class AutoPeftModelForTokenClassification(_BaseAutoPeftModel): + _target_class = AutoModelForTokenClassification + _target_peft_class = PeftModelForTokenClassification + + +class AutoPeftModelForQuestionAnswering(_BaseAutoPeftModel): + _target_class = AutoModelForQuestionAnswering + _target_peft_class = PeftModelForQuestionAnswering + + +class AutoPeftModelForFeatureExtraction(_BaseAutoPeftModel): + _target_class = AutoModel + _target_peft_class = PeftModelForFeatureExtraction diff --git a/MoRA/peft_mora/config.py b/MoRA/peft_mora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..83eb047f3f62112e02030149e3b4a496cee1555c --- /dev/null +++ b/MoRA/peft_mora/config.py @@ -0,0 +1,270 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import json +import os +from dataclasses import asdict, dataclass, field +from typing import Dict, Optional, Union + +from huggingface_hub import hf_hub_download +from transformers.utils import PushToHubMixin + +from .utils import CONFIG_NAME, PeftType, TaskType + + +@dataclass +class PeftConfigMixin(PushToHubMixin): + r""" + This is the base configuration class for PEFT adapter models. It contains all the methods that are common to all + PEFT adapter models. This class inherits from [`~transformers.utils.PushToHubMixin`] which contains the methods to + push your model to the Hub. 
The method `save_pretrained` will save the configuration of your adapter model in a + directory. The method `from_pretrained` will load the configuration of your adapter model from a directory. + + Args: + peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use. + """ + + peft_type: Optional[PeftType] = field(default=None, metadata={"help": "The type of PEFT model."}) + auto_mapping: Optional[dict] = field( + default=None, metadata={"help": "An auto mapping dict to help retrieve the base model class if needed."} + ) + + def to_dict(self) -> Dict: + r""" + Returns the configuration for your adapter model as a dictionary. + """ + return asdict(self) + + def save_pretrained(self, save_directory: str, **kwargs) -> None: + r""" + This method saves the configuration of your adapter model in a directory. + + Args: + save_directory (`str`): + The directory where the configuration will be saved. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the [`~transformers.utils.PushToHubMixin.push_to_hub`] + method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + auto_mapping_dict = kwargs.pop("auto_mapping_dict", None) + + output_dict = asdict(self) + # converting set type to list + for key, value in output_dict.items(): + if isinstance(value, set): + output_dict[key] = list(value) + + output_path = os.path.join(save_directory, CONFIG_NAME) + + # Add auto mapping details for custom models. + if auto_mapping_dict is not None: + output_dict["auto_mapping"] = auto_mapping_dict + + # save it + with open(output_path, "w") as writer: + writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) + + @classmethod + def from_peft_type(cls, **kwargs): + r""" + This method loads the configuration of your adapter model from a set of kwargs. + + The appropriate configuration type is determined by the `peft_type` argument. If `peft_type` is not provided, + the calling class type is instantiated. + + Args: + kwargs (configuration keyword arguments): + Keyword arguments passed along to the configuration initialization. + """ + # Avoid circular dependency .. TODO: fix this with a larger refactor + from peft_mora.mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + # TODO: this hack is needed to fix the following issue (on commit 702f937): + # if someone saves a default config and loads it back with `PeftConfig` class it yields to + # not loading the correct config class. + + # from peft import AdaLoraConfig, PeftConfig + # peft_config = AdaLoraConfig() + # print(peft_config) + # >>> AdaLoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, + # revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, ... 
+ # + # peft_config.save_pretrained("./test_config") + # peft_config = PeftConfig.from_pretrained("./test_config") + # print(peft_config) + # >>> PeftConfig(peft_type='ADALORA', auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False) + + if "peft_type" in kwargs: + peft_type = kwargs["peft_type"] + config_cls = PEFT_TYPE_TO_CONFIG_MAPPING[peft_type] + else: + config_cls = cls + + return config_cls(**kwargs) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, subfolder: Optional[str] = None, **kwargs): + r""" + This method loads the configuration of your adapter model from a directory. + + Args: + pretrained_model_name_or_path (`str`): + The directory or the Hub repository id where the configuration is saved. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the child class initialization. + """ + path = ( + os.path.join(pretrained_model_name_or_path, subfolder) + if subfolder is not None + else pretrained_model_name_or_path + ) + + hf_hub_download_kwargs, class_kwargs, _ = cls._split_kwargs(kwargs) + + if os.path.isfile(os.path.join(path, CONFIG_NAME)): + config_file = os.path.join(path, CONFIG_NAME) + else: + try: + config_file = hf_hub_download( + pretrained_model_name_or_path, CONFIG_NAME, subfolder=subfolder, **hf_hub_download_kwargs + ) + except Exception: + raise ValueError(f"Can't find '{CONFIG_NAME}' at '{pretrained_model_name_or_path}'") + + loaded_attributes = cls.from_json_file(config_file) + kwargs = {**class_kwargs, **loaded_attributes} + return cls.from_peft_type(**kwargs) + + @classmethod + def from_json_file(cls, path_json_file: str, **kwargs): + r""" + Loads a configuration file from a json file. + + Args: + path_json_file (`str`): + The path to the json file. + """ + with open(path_json_file) as file: + json_object = json.load(file) + + return json_object + + @classmethod + def _split_kwargs(cls, kwargs): + hf_hub_download_kwargs = {} + class_kwargs = {} + other_kwargs = {} + + for key, value in kwargs.items(): + if key in inspect.signature(hf_hub_download).parameters: + hf_hub_download_kwargs[key] = value + elif key in list(cls.__annotations__): + class_kwargs[key] = value + else: + other_kwargs[key] = value + + return hf_hub_download_kwargs, class_kwargs, other_kwargs + + @classmethod + def _get_peft_type( + cls, + model_id: str, + **hf_hub_download_kwargs, + ): + subfolder = hf_hub_download_kwargs.get("subfolder", None) + + path = os.path.join(model_id, subfolder) if subfolder is not None else model_id + + if os.path.isfile(os.path.join(path, CONFIG_NAME)): + config_file = os.path.join(path, CONFIG_NAME) + else: + try: + config_file = hf_hub_download( + model_id, + CONFIG_NAME, + **hf_hub_download_kwargs, + ) + except Exception: + raise ValueError(f"Can't find '{CONFIG_NAME}' at '{model_id}'") + + loaded_attributes = cls.from_json_file(config_file) + return loaded_attributes["peft_type"] + + @property + def is_prompt_learning(self) -> bool: + r""" + Utility method to check if the configuration is for prompt learning. + """ + return False + + @property + def is_adaption_prompt(self) -> bool: + """Return True if this is an adaption prompt config.""" + return False + + +@dataclass +class PeftConfig(PeftConfigMixin): + """ + This is the base configuration class to store the configuration of a [`PeftModel`]. + + Args: + peft_type (Union[[`~peft.utils.config.PeftType`], `str`]): The type of Peft method to use. 
+ task_type (Union[[`~peft.utils.config.TaskType`], `str`]): The type of task to perform. + inference_mode (`bool`, defaults to `False`): Whether to use the Peft model in inference mode. + """ + + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name of the base model to use."} + ) + revision: Optional[str] = field(default=None, metadata={"help": "The specific model version to use."}) + peft_type: Optional[Union[str, PeftType]] = field(default=None, metadata={"help": "Peft type"}) + task_type: Optional[Union[str, TaskType]] = field(default=None, metadata={"help": "Task type"}) + inference_mode: bool = field(default=False, metadata={"help": "Whether to use inference mode"}) + + +@dataclass +class PromptLearningConfig(PeftConfig): + """ + This is the base configuration class to store the configuration of [`PrefixTuning`], [`PromptEncoder`], or + [`PromptTuning`]. + + Args: + num_virtual_tokens (`int`): The number of virtual tokens to use. + token_dim (`int`): The hidden embedding dimension of the base transformer model. + num_transformer_submodules (`int`): The number of transformer submodules in the base transformer model. + num_attention_heads (`int`): The number of attention heads in the base transformer model. + num_layers (`int`): The number of layers in the base transformer model. + """ + + num_virtual_tokens: int = field(default=None, metadata={"help": "Number of virtual tokens"}) + token_dim: int = field( + default=None, metadata={"help": "The hidden embedding dimension of the base transformer model"} + ) + num_transformer_submodules: Optional[int] = field( + default=None, metadata={"help": "Number of transformer submodules"} + ) + num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"}) + num_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer layers"}) + + @property + def is_prompt_learning(self) -> bool: + r""" + Utility method to check if the configuration is for prompt learning. 
+ """ + return True diff --git a/MoRA/peft_mora/helpers.py b/MoRA/peft_mora/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..8875ff7fc493ae4dfff11a1d8e4485b330cb27dc --- /dev/null +++ b/MoRA/peft_mora/helpers.py @@ -0,0 +1,113 @@ +import inspect +from copy import deepcopy +from functools import update_wrapper +from types import MethodType + +from .peft_model import PeftModel + + +def update_forward_signature(model: PeftModel) -> None: + """ + Args: + Updates the forward signature of the PeftModel to include parents class signature + model (`PeftModel`): Peft model to update the forward signature + Example: + + ```python + >>> from transformers import WhisperForConditionalGeneration + >>> from peft import get_peft_model, LoraConfig, update_forward_signature + + >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + >>> peft_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj", "v_proj"]) + + >>> peft_model = get_peft_model(model, peft_config) + >>> update_forward_signature(peft_model) + ``` + """ + + # Only update signature when the current forward signature only has *args and **kwargs + current_signature = inspect.signature(model.forward) + if ( + len(current_signature.parameters) == 2 + and "args" in current_signature.parameters + and "kwargs" in current_signature.parameters + ): + forward = deepcopy(model.forward.__func__) + update_wrapper( + forward, type(model.get_base_model()).forward, assigned=("__doc__", "__name__", "__annotations__") + ) + model.forward = MethodType(forward, model) + + +def update_generate_signature(model: PeftModel) -> None: + """ + Args: + Updates the generate signature of a PeftModel with overriding generate to include parents class signature + model (`PeftModel`): Peft model to update the generate signature + Example: + + ```python + >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + >>> from peft import get_peft_model, LoraConfig, TaskType, update_generate_signature + + >>> model_name_or_path = "bigscience/mt0-large" + >>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + >>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) + + >>> peft_config = LoraConfig( + ... task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 + ... 
) + >>> peft_model = get_peft_model(model, peft_config) + >>> update_generate_signature(peft_model) + >>> help(peft_model.generate) + ``` + """ + if not hasattr(model, "generate"): + return + current_signature = inspect.signature(model.generate) + if ( + len(current_signature.parameters) == 2 + and "args" in current_signature.parameters + and "kwargs" in current_signature.parameters + ) or (len(current_signature.parameters) == 1 and "kwargs" in current_signature.parameters): + generate = deepcopy(model.generate.__func__) + update_wrapper( + generate, + type(model.get_base_model()).generate, + assigned=("__doc__", "__name__", "__annotations__"), + ) + model.generate = MethodType(generate, model) + + +def update_signature(model: PeftModel, method: str = "all") -> None: + """ + Args: + Updates the signature of a PeftModel include parents class signature for forward or generate method + model (`PeftModel`): Peft model to update generate or forward signature method (`str`): method to update + signature choose one of "forward", "generate", "all" + Example: + ```python + >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer + >>> from peft import get_peft_model, LoraConfig, TaskType, update_signature + + >>> model_name_or_path = "bigscience/mt0-large" + >>> tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + >>> model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path) + + >>> peft_config = LoraConfig( + ... task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 + ... ) + >>> peft_model = get_peft_model(model, peft_config) + >>> update_signature(peft_model) + >>> help(peft_model.generate) + ``` + """ + if method == "forward": + update_forward_signature(model) + elif method == "generate": + update_generate_signature(model) + elif method == "all": + update_forward_signature(model) + update_generate_signature(model) + else: + raise ValueError(f"method {method} is not supported please choose one of ['forward', 'generate', 'all']") diff --git a/MoRA/peft_mora/import_utils.py b/MoRA/peft_mora/import_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6c32d96d52e74bd5de879c06c732fbf82417a8b6 --- /dev/null +++ b/MoRA/peft_mora/import_utils.py @@ -0,0 +1,73 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
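+# Optional-dependency probes: each `is_*_available()` helper below checks whether a
+# backend such as bitsandbytes, auto-gptq, optimum, torch_xla, aqlm, or awq can be
+# imported, so quantization- and TPU-specific code paths can be guarded lazily.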
+import importlib +import importlib.metadata as importlib_metadata +from functools import lru_cache + +import packaging.version + + +def is_bnb_available() -> bool: + return importlib.util.find_spec("bitsandbytes") is not None + + +def is_bnb_4bit_available() -> bool: + if not is_bnb_available(): + return False + + import bitsandbytes as bnb + + return hasattr(bnb.nn, "Linear4bit") + + +def is_auto_gptq_available(): + if importlib.util.find_spec("auto_gptq") is not None: + AUTOGPTQ_MINIMUM_VERSION = packaging.version.parse("0.5.0") + version_autogptq = packaging.version.parse(importlib_metadata.version("auto_gptq")) + if AUTOGPTQ_MINIMUM_VERSION <= version_autogptq: + return True + else: + raise ImportError( + f"Found an incompatible version of auto-gptq. Found version {version_autogptq}, " + f"but only versions above {AUTOGPTQ_MINIMUM_VERSION} are supported" + ) + + +def is_optimum_available() -> bool: + return importlib.util.find_spec("optimum") is not None + + +@lru_cache +def is_torch_tpu_available(check_device=True): + "Checks if `torch_xla` is installed and potentially if a TPU is in the environment" + if importlib.util.find_spec("torch_xla") is not None: + if check_device: + # We need to check if `xla_device` can be found, will raise a RuntimeError if not + try: + import torch_xla.core.xla_model as xm + + _ = xm.xla_device() + return True + except RuntimeError: + return False + return True + return False + + +def is_aqlm_available(): + return importlib.util.find_spec("aqlm") is not None + + +def is_auto_awq_available(): + return importlib.util.find_spec("awq") is not None diff --git a/MoRA/peft_mora/mapping.py b/MoRA/peft_mora/mapping.py new file mode 100644 index 0000000000000000000000000000000000000000..b62ddf94aafa1a32b2711c0a6e365900065a93b4 --- /dev/null +++ b/MoRA/peft_mora/mapping.py @@ -0,0 +1,168 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
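+# Registries mapping task types and PEFT types to their model, config, and tuner
+# classes, plus the `get_peft_model` and `inject_adapter_in_model` entry points
+# that wrap a base transformers model with an adapter.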
+ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import torch + +from .config import PeftConfig +from .mixed_model import PeftMixedModel +from .peft_model import ( + PeftModel, + PeftModelForCausalLM, + PeftModelForFeatureExtraction, + PeftModelForQuestionAnswering, + PeftModelForSeq2SeqLM, + PeftModelForSequenceClassification, + PeftModelForTokenClassification, +) +from .tuners import ( + AdaLoraConfig, + AdaLoraModel, + AdaptionPromptConfig, + IA3Config, + IA3Model, + LoHaConfig, + LoHaModel, + LoKrConfig, + LoKrModel, + LoraConfig, + LoraModel, + MultitaskPromptTuningConfig, + OFTConfig, + OFTModel, + PolyConfig, + PolyModel, + PrefixTuningConfig, + PromptEncoderConfig, + PromptTuningConfig, +) +from .utils import _prepare_prompt_learning_config + + +if TYPE_CHECKING: + from transformers import PreTrainedModel + + +MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, PeftModel] = { + "SEQ_CLS": PeftModelForSequenceClassification, + "SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, + "CAUSAL_LM": PeftModelForCausalLM, + "TOKEN_CLS": PeftModelForTokenClassification, + "QUESTION_ANS": PeftModelForQuestionAnswering, + "FEATURE_EXTRACTION": PeftModelForFeatureExtraction, +} + +PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, PeftConfig] = { + "ADAPTION_PROMPT": AdaptionPromptConfig, + "PROMPT_TUNING": PromptTuningConfig, + "PREFIX_TUNING": PrefixTuningConfig, + "P_TUNING": PromptEncoderConfig, + "LORA": LoraConfig, + "LOHA": LoHaConfig, + "LOKR": LoKrConfig, + "ADALORA": AdaLoraConfig, + "IA3": IA3Config, + "MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, + "OFT": OFTConfig, + "POLY": PolyConfig, +} + +PEFT_TYPE_TO_TUNER_MAPPING = { + "LORA": LoraModel, + "LOHA": LoHaModel, + "LOKR": LoKrModel, + "ADALORA": AdaLoraModel, + "IA3": IA3Model, + "OFT": OFTModel, + "POLY": PolyModel, +} + + +def get_peft_config(config_dict: dict[str, Any]) -> PeftConfig: + """ + Returns a Peft config object from a dictionary. + + Args: + config_dict (`Dict[str, Any]`): Dictionary containing the configuration parameters. + """ + + return PEFT_TYPE_TO_CONFIG_MAPPING[config_dict["peft_type"]](**config_dict) + + +def get_peft_model( + model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default", mixed: bool = False +) -> PeftModel | PeftMixedModel: + """ + Returns a Peft model object from a model and a config. + + Args: + model ([`transformers.PreTrainedModel`]): + Model to be wrapped. + peft_config ([`PeftConfig`]): + Configuration object containing the parameters of the Peft model. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the adapter to be injected, if not provided, the default adapter name is used ("default"). + mixed (`bool`, `optional`, defaults to `False`): + Whether to allow mixing different (compatible) adapter types. 
+ """ + model_config = getattr(model, "config", {"model_type": "custom"}) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + + peft_config.base_model_name_or_path = model.__dict__.get("name_or_path", None) + + if mixed: + return PeftMixedModel(model, peft_config, adapter_name=adapter_name) + + if peft_config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not peft_config.is_prompt_learning: + return PeftModel(model, peft_config, adapter_name=adapter_name) + + if peft_config.is_prompt_learning: + peft_config = _prepare_prompt_learning_config(peft_config, model_config) + return MODEL_TYPE_TO_PEFT_MODEL_MAPPING[peft_config.task_type](model, peft_config, adapter_name=adapter_name) + + +def inject_adapter_in_model( + peft_config: PeftConfig, model: torch.nn.Module, adapter_name: str = "default" +) -> torch.nn.Module: + r""" + A simple API to create and inject adapter in-place into a model. Currently the API does not support prompt learning + methods and adaption prompt. Make sure to have the correct `target_names` set in the `peft_config` object. The API + calls `get_peft_model` under the hood but would be restricted only to non-prompt learning methods. + + Args: + peft_config (`PeftConfig`): + Configuration object containing the parameters of the Peft model. + model (`torch.nn.Module`): + The input model where the adapter will be injected. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the adapter to be injected, if not provided, the default adapter name is used ("default"). + """ + if peft_config.is_prompt_learning or peft_config.is_adaption_prompt: + raise ValueError("`create_and_replace` does not support prompt learning and adaption prompt yet.") + + if peft_config.peft_type not in PEFT_TYPE_TO_TUNER_MAPPING.keys(): + raise ValueError( + f"`inject_adapter_in_model` does not support {peft_config.peft_type} yet. Please use `get_peft_model`." + ) + + tuner_cls = PEFT_TYPE_TO_TUNER_MAPPING[peft_config.peft_type] + + # By instantiating a peft model we are injecting randomly initialized LoRA layers into the model's modules. + peft_model = tuner_cls(model, peft_config, adapter_name=adapter_name) + + return peft_model.model diff --git a/MoRA/peft_mora/mixed_model.py b/MoRA/peft_mora/mixed_model.py new file mode 100644 index 0000000000000000000000000000000000000000..bd0fff0e3497924af536bafbc12d05bff3e5fd79 --- /dev/null +++ b/MoRA/peft_mora/mixed_model.py @@ -0,0 +1,402 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +import os +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from accelerate.hooks import remove_hook_from_submodules +from torch import nn +from transformers.utils import PushToHubMixin + +from peft_mora.tuners.mixed import COMPATIBLE_TUNER_TYPES + +from .config import PeftConfig +from .peft_model import PeftModel +from .tuners import ( + AdaLoraModel, + IA3Model, + LoHaModel, + LoKrModel, + LoraModel, + MixedModel, + OFTModel, +) +from .utils import PeftType, _set_adapter, _set_trainable + + +PEFT_TYPE_TO_MODEL_MAPPING = { + PeftType.LORA: LoraModel, + PeftType.LOHA: LoHaModel, + PeftType.LOKR: LoKrModel, + PeftType.ADALORA: AdaLoraModel, + PeftType.IA3: IA3Model, + PeftType.OFT: OFTModel, +} + + +def _prepare_model_for_gradient_checkpointing(model: nn.Module) -> None: + r""" + Prepares the model for gradient checkpointing if necessary + """ + # Note: same as PeftModel._prepare_model_for_gradient_checkpointing + if not getattr(model, "is_gradient_checkpointing", True): + return model + + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + +def _check_config_compatible(peft_config: PeftConfig) -> None: + if peft_config.peft_type not in COMPATIBLE_TUNER_TYPES: + raise ValueError( + f"The provided `peft_type` '{peft_config.peft_type.value}' is not compatible with the `PeftMixedModel`. " + f"Compatible types are: {COMPATIBLE_TUNER_TYPES}" + ) + + +class PeftMixedModel(PushToHubMixin, torch.nn.Module): + """ + PeftMixedModel for loading mixing different types of adapters for inference. + + This class does not support loading/saving, and it shouldn't usually be initialized directly. Instead, use + `get_peft_model` with the argument `mixed=True`. + + + + Read the [Mixed adapter types](https://huggingface.co/docs/peft/en/developer_guides/mixed_models) guide to learn + more about using different adapter types. + + + + Example: + + ```py + >>> from peft import get_peft_model + + >>> base_model = ... # load the base model, e.g. from transformers + >>> peft_model = PeftMixedModel.from_pretrained(base_model, path_to_adapter1, "adapter1").eval() + >>> peft_model.load_adapter(path_to_adapter2, "adapter2") + >>> peft_model.set_adapter(["adapter1", "adapter2"]) # activate both adapters + >>> peft_model(data) # forward pass using both adapters + ``` + + Args: + model (`torch.nn.Module`): + The model to be tuned. + config (`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (`str`, `optional`, defaults to `"default"`): + The name of the first adapter. 
+ """ + + def __init__(self, model: nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__() + _check_config_compatible(peft_config) + _prepare_model_for_gradient_checkpointing(model) + self.modules_to_save = None + self.base_model = MixedModel(model, {adapter_name: peft_config}, adapter_name) + self.set_modules_to_save(peft_config, adapter_name) + + self.config = getattr(model, "config", {"model_type": "custom"}) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. + if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + return self.base_model.peft_config + + @property + def active_adapter(self) -> str: + return self.base_model.active_adapter + + @property + def active_adapters(self) -> list[str]: + return self.base_model.active_adapters + + def get_nb_trainable_parameters(self): + r""" + Returns the number of trainable parameters and number of all parameters in the model. + """ + # note: same as PeftModel.get_nb_trainable_parameters + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self): + """ + Prints the number of trainable parameters in the model. + """ + # note: same as PeftModel.print_trainable_parameters + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || " + f"all params: {all_param:,d} || " + f"trainable%: {100 * trainable_params / all_param:.4f}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.base_model, name) + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + return self.base_model(*args, **kwargs) + + def generate(self, *args: Any, **kwargs: Any): + """ + Generate output. + """ + return self.base_model.generate(*args, **kwargs) + + @contextmanager + def disable_adapter(self): + """ + Disables the adapter module. 
+ """ + try: + self.base_model.disable_adapter_layers() + yield + finally: + self.base_model.enable_adapter_layers() + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig): + _check_config_compatible(peft_config) + + try: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter(self, adapter_name) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_modules_to_save(peft_config, adapter_name) + + def set_modules_to_save(self, peft_config: PeftConfig, adapter_name: str) -> None: + if (modules_to_save := getattr(peft_config, "modules_to_save", None)) is None: + return + + if self.modules_to_save is None: + self.modules_to_save = set(modules_to_save) + else: + self.modules_to_save.update(modules_to_save) + _set_trainable(self, adapter_name) + + def set_adapter(self, adapter_name: Union[str, list[str]]) -> None: + """ + Sets the active adapter(s) for the model. + + Note that the order in which the adapters are applied during the forward pass may not be the same as the order + in which they are passed to this function. Instead, the order during the forward pass is determined by the + order in which the adapters were loaded into the model. The active adapters only determine which adapters are + active during the forward pass, but not the order in which they are applied. + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `List[str]`): + The name of the adapter(s) to be activated. + """ + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.set_adapter(adapter_name) + _set_adapter(self, adapter_name) + + def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None: + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + mismatched = set(adapter_name) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + self.base_model.delete_adapter(adapter_name) + + def merge_and_unload(self, *args: Any, **kwargs: Any): + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self.base_model.merge_and_unload(*args, **kwargs) + + def unload(self, *args: Any, **kwargs: Any): + """ + Gets back the base model by removing all the adapter modules without merging. This gives back the original base + model. 
+ """ + return self.base_model.unload(*args, **kwargs) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + return PeftModel._split_kwargs(kwargs) + + def load_adapter(self, model_id: str, adapter_name: str, *args: Any, **kwargs: Any): + output = PeftModel.load_adapter(self, model_id, adapter_name, *args, **kwargs) + # TODO: not quite clear why this is necessary but tests fail without it + self.set_adapter(self.active_adapters) + return output + + def create_or_update_model_card(self, output_dir: str): + raise NotImplementedError(f"Model card creation is not supported for {self.__class__.__name__} (yet).") + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = False, + selected_adapters: Optional[list[str]] = None, + **kwargs: Any, + ): + raise NotImplementedError(f"Saving is not supported for {self.__class__.__name__} (yet).") + + @classmethod + def from_pretrained( + cls, + model: nn.Module, + model_id: str | os.PathLike, + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs: Any, + ): + r""" + Instantiate a PEFT mixed model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model (`nn.Module`): + The model to be adapted. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and use for + inference + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. 
+ """ + # note: adapted from PeftModel.from_pretrained + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + # load the config + if config is None: + config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + subfolder=kwargs.get("subfolder", None), + revision=kwargs.get("revision", None), + cache_dir=kwargs.get("cache_dir", None), + use_auth_token=kwargs.get("use_auth_token", None), + ) + ].from_pretrained(model_id, **kwargs) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + # note: this is different from PeftModel.from_pretrained + if config.peft_type not in PEFT_TYPE_TO_MODEL_MAPPING: + raise ValueError(f"Adapter of type {config.peft_type} is not supported for mixed models.") + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + # note: should not be possible to reach, but just in case + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + + # note: this is different from PeftModel.from_pretrained, we always return a PeftMixedModel + model = cls(model, config, adapter_name) + model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs) + return model diff --git a/MoRA/peft_mora/peft_model.py b/MoRA/peft_mora/peft_model.py new file mode 100644 index 0000000000000000000000000000000000000000..88bfb070e921d5e1def04d4f8a7eaa99f290d617 --- /dev/null +++ b/MoRA/peft_mora/peft_model.py @@ -0,0 +1,1929 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import collections +import inspect +import os +import warnings +from contextlib import contextmanager +from copy import deepcopy +from typing import Any, Optional, Union + +import packaging.version +import torch +import transformers +from accelerate import dispatch_model, infer_auto_device_map +from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules +from accelerate.utils import get_balanced_memory +from huggingface_hub import ModelCard, ModelCardData, hf_hub_download +from safetensors.torch import save_file as safe_save_file +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers import PreTrainedModel +from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput +from transformers.utils import PushToHubMixin + +from . 
import __version__ +from .config import PeftConfig +from .tuners import ( + AdaLoraModel, + AdaptionPromptModel, + IA3Model, + LoHaModel, + LoKrModel, + LoraModel, + MultitaskPromptEmbedding, + OFTModel, + PolyModel, + PrefixEncoder, + PromptEmbedding, + PromptEncoder, +) +from .utils import ( + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + WEIGHTS_NAME, + PeftType, + TaskType, + _get_batch_size, + _prepare_prompt_learning_config, + _set_adapter, + _set_trainable, + get_peft_model_state_dict, + id_tensor_storage, + infer_device, + load_peft_weights, + set_peft_model_state_dict, + shift_tokens_right, +) + + +PEFT_TYPE_TO_MODEL_MAPPING = { + PeftType.LORA: LoraModel, + PeftType.LOHA: LoHaModel, + PeftType.LOKR: LoKrModel, + PeftType.PROMPT_TUNING: PromptEmbedding, + PeftType.P_TUNING: PromptEncoder, + PeftType.PREFIX_TUNING: PrefixEncoder, + PeftType.ADALORA: AdaLoraModel, + PeftType.ADAPTION_PROMPT: AdaptionPromptModel, + PeftType.IA3: IA3Model, + PeftType.OFT: OFTModel, + PeftType.POLY: PolyModel, +} + + +class PeftModel(PushToHubMixin, torch.nn.Module): + """ + Base model encompassing various Peft methods. + + Args: + model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft. + peft_config ([`PeftConfig`]): The configuration of the Peft model. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + + **Attributes**: + - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft. + - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model. + - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when + saving the model. + - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if + using [`PromptLearningConfig`]. + - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if + using [`PromptLearningConfig`]. + - **transformer_backbone_name** (`str`) -- The name of the transformer + backbone in the base model if using [`PromptLearningConfig`]. + - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone + in the base model if using [`PromptLearningConfig`]. + """ + + def __init__(self, model: PreTrainedModel, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__() + self.modules_to_save = None + self.active_adapter = adapter_name + self.peft_type = peft_config.peft_type + + self._is_prompt_learning = peft_config.is_prompt_learning + if self._is_prompt_learning: + self._peft_config = {adapter_name: peft_config} + self.base_model = model + self.add_adapter(adapter_name, peft_config) + else: + self._peft_config = None + cls = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type] + self.base_model = cls(model, {adapter_name: peft_config}, adapter_name) + self.set_additional_trainable_modules(peft_config, adapter_name) + + if getattr(model, "is_gradient_checkpointing", True): + model = self._prepare_model_for_gradient_checkpointing(model) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. 
+ if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + if self._is_prompt_learning: + return self._peft_config + return self.base_model.peft_config + + @property + def active_adapters(self) -> list[str]: + try: + adapters = self.base_model.active_adapters + except AttributeError: + adapters = self.active_adapter + if isinstance(adapters, str): + adapters = [adapters] + return adapters + + @peft_config.setter + def peft_config(self, value: dict[str, PeftConfig]): + if self._is_prompt_learning: + self._peft_config = value + else: + self.base_model.peft_config = value + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = True, + selected_adapters: Optional[list[str]] = None, + save_embedding_layers: Union[str, bool] = "auto", + is_main_process: bool = True, + **kwargs: Any, + ) -> None: + r""" + This function saves the adapter model and the adapter configuration files to a directory, so that it can be + reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`] + method. + + Args: + save_directory (`str`): + Directory where the adapter model and configuration files will be saved (will be created if it does not + exist). + safe_serialization (`bool`, *optional*): + Whether to save the adapter files in safetensors format, defaults to `True`. + selected_adapters (`List[str]`, *optional*): + A list of adapters to be saved. If `None`, will default to all adapters. + save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `"auto"`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common + embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. + and automatically sets the boolean flag. This only works for πŸ€— transformers models. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Will default to `True`. Will not save the + checkpoint if not on the main process, which is important for multi device setups (e.g. DDP). + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the `push_to_hub` method. + """ + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + + if selected_adapters is None: + selected_adapters = list(self.peft_config.keys()) + else: + if any( + selected_adapter_name not in list(self.peft_config.keys()) + for selected_adapter_name in selected_adapters + ): + raise ValueError( + f"You passed an invalid `selected_adapters` arguments, current supported adapter names are" + f" {list(self.peft_config.keys())} - got {selected_adapters}." 
+ ) + + if is_main_process: + os.makedirs(save_directory, exist_ok=True) + self.create_or_update_model_card(save_directory) + + for adapter_name in selected_adapters: + peft_config = self.peft_config[adapter_name] + # save only the trainable weights + output_state_dict = get_peft_model_state_dict( + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + save_embedding_layers=save_embedding_layers, + ) + output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory + os.makedirs(output_dir, exist_ok=True) + + if is_main_process and safe_serialization: + # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in output_state_dict.items(): + # Sometimes in the state_dict we have non-tensor objects. + # e.g. in bitsandbytes we have some `str` objects in the state_dict + if isinstance(tensor, torch.Tensor): + ptrs[id_tensor_storage(tensor)].append(name) + else: + # In the non-tensor case, fall back to the pointer of the object itself + ptrs[id(tensor)].append(name) + + # These are all the pointers of shared tensors. + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + + for _, names in shared_ptrs.items(): + # Here we just clone the shared tensors to avoid tensor aliasing which is + # not supported in safetensors. + for shared_tensor_name in names[1:]: + output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone() + + safe_save_file( + output_state_dict, + os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), + metadata={"format": "pt"}, + ) + elif is_main_process: + torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + + # save the config and change the inference mode to `True` + if peft_config.base_model_name_or_path is None: + peft_config.base_model_name_or_path = ( + self.base_model.__dict__.get("name_or_path", None) + if peft_config.is_prompt_learning + else self.base_model.model.__dict__.get("name_or_path", None) + ) + inference_mode = peft_config.inference_mode + peft_config.inference_mode = True + + if peft_config.task_type is None: + # deal with auto mapping + base_model_class = self._get_base_model_class( + is_prompt_tuning=peft_config.is_prompt_learning, + ) + parent_library = base_model_class.__module__ + + auto_mapping_dict = { + "base_model_class": base_model_class.__name__, + "parent_library": parent_library, + } + else: + auto_mapping_dict = None + + if is_main_process: + peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) + peft_config.inference_mode = inference_mode + + @classmethod + def from_pretrained( + cls, + model: torch.nn.Module, + model_id: Union[str, os.PathLike], + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + **kwargs: Any, + ) -> PeftModel: + r""" + Instantiate a PEFT model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model ([`torch.nn.Module`]): + The model to be adapted. For πŸ€— Transformers models, the model should be initialized with the + [`~transformers.PreTrainedModel.from_pretrained`]. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. 
Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. + """ + from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING + + # load the config + if config is None: + config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + subfolder=kwargs.get("subfolder", None), + revision=kwargs.get("revision", None), + cache_dir=kwargs.get("cache_dir", None), + use_auth_token=kwargs.get("use_auth_token", None), + token=kwargs.get("token", None), + ) + ].from_pretrained(model_id, **kwargs) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + + if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys(): + model = cls(model, config, adapter_name) + else: + model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type](model, config, adapter_name) + model.load_adapter(model_id, adapter_name, is_trainable=is_trainable, **kwargs) + return model + + def _setup_prompt_encoder(self, adapter_name: str): + config = self.peft_config[adapter_name] + if not hasattr(self, "prompt_encoder"): + self.prompt_encoder = torch.nn.ModuleDict({}) + self.prompt_tokens = {} + transformer_backbone = None + for name, module in self.base_model.named_children(): + for param in module.parameters(): + param.requires_grad = False + if isinstance(module, PreTrainedModel): + # Make sure to freeze Tranformers model + if transformer_backbone is None: + transformer_backbone = module + self.transformer_backbone_name = name + if transformer_backbone is None: + transformer_backbone = self.base_model + + if config.num_transformer_submodules is None: + config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 + + for named_param, value in list(transformer_backbone.named_parameters()): + # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape [0] + # the actual unsharded shape is stored in "ds_shape" attribute + # special handling is needed in case the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig + # has been called before + # For reference refer to issue: 
https://github.com/huggingface/peft/issues/996 + deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None) + + if value.shape[0] == self.base_model.config.vocab_size or ( + deepspeed_distributed_tensor_shape is not None + and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size + ): + self.word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", "")) + break + + if config.peft_type == PeftType.PROMPT_TUNING: + prompt_encoder = PromptEmbedding(config, self.word_embeddings) + elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings) + elif config.peft_type == PeftType.P_TUNING: + prompt_encoder = PromptEncoder(config) + elif config.peft_type == PeftType.PREFIX_TUNING: + prompt_encoder = PrefixEncoder(config) + else: + raise ValueError("Not supported") + + prompt_encoder = prompt_encoder.to(self.device) + self.prompt_encoder.update(torch.nn.ModuleDict({adapter_name: prompt_encoder})) + self.prompt_tokens[adapter_name] = torch.arange( + config.num_virtual_tokens * config.num_transformer_submodules + ).long() + + def _prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel): + r""" + Prepares the model for gradient checkpointing if necessary + """ + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + return model + + def get_prompt_embedding_to_save(self, adapter_name: str) -> torch.Tensor: + """ + Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning + method. + """ + prompt_encoder = self.prompt_encoder[adapter_name] + prompt_tokens = ( + self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device) + ) + if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens] + + if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens) + else: + prompt_embeddings = prompt_encoder(prompt_tokens) + + return prompt_embeddings[0].detach().cpu() + + def get_prompt(self, batch_size: int, task_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method. 
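+        For prefix tuning this returns the past key values that are handed to the base model's cache; for prompt
+        tuning and p-tuning it returns the prompt embeddings that are concatenated with the input embeddings.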
+ """ + peft_config = self.active_peft_config + prompt_encoder = self.prompt_encoder[self.active_adapter] + prompt_tokens = ( + self.prompt_tokens[self.active_adapter] + .unsqueeze(0) + .expand(batch_size, -1) + .to(prompt_encoder.embedding.weight.device) + ) + if peft_config.peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens] + if peft_config.inference_mode: + past_key_values = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) + else: + past_key_values = prompt_encoder(prompt_tokens) + if self.base_model_torch_dtype is not None: + past_key_values = past_key_values.to(self.base_model_torch_dtype) + past_key_values = past_key_values.view( + batch_size, + peft_config.num_virtual_tokens, + peft_config.num_layers * 2, + peft_config.num_attention_heads, + peft_config.token_dim // peft_config.num_attention_heads, + ) + if peft_config.num_transformer_submodules == 2: + past_key_values = torch.cat([past_key_values, past_key_values], dim=2) + past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split( + peft_config.num_transformer_submodules * 2 + ) + if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None: + post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type] + past_key_values = post_process_fn(past_key_values) + return past_key_values + else: + if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompts = prompt_encoder(prompt_tokens, task_ids) + else: + if peft_config.inference_mode: + prompts = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) + else: + prompts = prompt_encoder(prompt_tokens) + return prompts + + def get_nb_trainable_parameters(self) -> tuple[int, int]: + r""" + Returns the number of trainable parameters and the number of all parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + num_params = num_params * 2 + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self) -> None: + """ + Prints the number of trainable parameters in the model. + """ + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.base_model, name) + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ + return self.get_base_model()(*args, **kwargs) + + def _get_base_model_class(self, is_prompt_tuning=False): + """ + Returns the base model class. + """ + if not is_prompt_tuning: + return self.base_model.model.__class__ + return self.base_model.__class__ + + @contextmanager + def disable_adapter(self): + """ + Context manager that disables the adapter module. 
Use this to run inference on the base model. + + Example: + + ```py + >>> with model.disable_adapter(): + ... model(inputs) + ``` + """ + try: + if self.peft_config[self.active_adapter].is_prompt_learning: + # TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and + # letting the underlying methods deal with it, same as how LoRA does it. + old_forward = self.forward + self.forward = self.base_model.forward + old_prepare_inputs_for_generation = self.prepare_inputs_for_generation + self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + else: + self.base_model.disable_adapter_layers() + yield + finally: + if self.peft_config[self.active_adapter].is_prompt_learning: + self.forward = old_forward + self.prepare_inputs_for_generation = old_prepare_inputs_for_generation + else: + self.base_model.enable_adapter_layers() + + def get_base_model(self) -> torch.nn.Module: + """ + Returns the base model. + """ + return ( + self.base_model + if (self.active_peft_config.is_prompt_learning or self.peft_type == PeftType.POLY) + else self.base_model.model + ) + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig) -> None: + """ + Add an adapter to the model based on the passed configuration. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + """ + if peft_config.peft_type != self.peft_type: + raise ValueError( + f"Cannot combine adapters with different peft types. " + f"Found {self.peft_type} and {peft_config.peft_type}." + ) + + try: + if peft_config.is_prompt_learning: + self.peft_config[adapter_name] = peft_config + if hasattr(self.config, "to_dict"): + dict_config = self.config.to_dict() + else: + dict_config = self.config + + peft_config = _prepare_prompt_learning_config(peft_config, dict_config) + self._setup_prompt_encoder(adapter_name) + elif peft_config.is_adaption_prompt: + self.base_model.add_adapter(adapter_name, peft_config) + else: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter(self.base_model.model, adapter_name) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_additional_trainable_modules(peft_config, adapter_name) + + def set_additional_trainable_modules(self, peft_config, adapter_name): + if getattr(peft_config, "modules_to_save", None) is not None: + if self.modules_to_save is None: + self.modules_to_save = set(peft_config.modules_to_save) + else: + self.modules_to_save.update(peft_config.modules_to_save) + _set_trainable(self, adapter_name) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + _kwargs_not_in_hf_hub_download_signature = ("use_auth_token",) + hf_hub_download_kwargs = {} + other_kwargs = {} + + for key, value in kwargs.items(): + if key in inspect.signature(hf_hub_download).parameters or key in _kwargs_not_in_hf_hub_download_signature: + hf_hub_download_kwargs[key] = value + else: + other_kwargs[key] = value + + return hf_hub_download_kwargs, other_kwargs + + def load_adapter(self, model_id: str, adapter_name: str, is_trainable: bool = False, **kwargs: Any): + """ + Load a trained adapter into the model. + + The name for the new adapter should be unique. 
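+
+        Unlike [`PeftModel.from_pretrained`], which creates the PEFT model and then loads the weights, this method
+        loads an additional adapter (its configuration and weights) into a PEFT model that already exists.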
+ + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + kwargs: (`optional`): + Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub. + """ + from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + hf_hub_download_kwargs, kwargs = self._split_kwargs(kwargs) + torch_device = infer_device() + + if adapter_name not in self.peft_config: + # load the config + peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + **hf_hub_download_kwargs, + ) + ].from_pretrained( + model_id, + **hf_hub_download_kwargs, + ) + if peft_config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + peft_config.inference_mode = not is_trainable + self.add_adapter(adapter_name, peft_config) + + adapters_weights = load_peft_weights(model_id, device=torch_device, **hf_hub_download_kwargs) + + # load the weights into the model + load_result = set_peft_model_state_dict(self, adapters_weights, adapter_name=adapter_name) + if ( + (getattr(self, "hf_device_map", None) is not None) + and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0) + and len(self.peft_config) == 1 + ): + device_map = kwargs.get("device_map", "auto") + max_memory = kwargs.get("max_memory", None) + offload_dir = kwargs.get("offload_folder", None) + offload_index = kwargs.get("offload_index", None) + + dispatch_model_kwargs = {} + # Safety checker for previous `accelerate` versions + # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/ + if "offload_index" in inspect.signature(dispatch_model).parameters: + dispatch_model_kwargs["offload_index"] = offload_index + + no_split_module_classes = self._no_split_modules + + if device_map != "sequential": + max_memory = get_balanced_memory( + self, + max_memory=max_memory, + no_split_module_classes=no_split_module_classes, + low_zero=(device_map == "balanced_low_0"), + ) + if isinstance(device_map, str): + device_map = infer_auto_device_map( + self, max_memory=max_memory, no_split_module_classes=no_split_module_classes + ) + dispatch_model( + self, + device_map=device_map, + offload_dir=offload_dir, + **dispatch_model_kwargs, + ) + hook = AlignDevicesHook(io_same_device=True) + if self.peft_config[adapter_name].is_prompt_learning: + remove_hook_from_submodules(self.prompt_encoder) + add_hook_to_module(self.get_base_model(), hook) + + # Set model in evaluation mode to deactivate Dropout modules by default + if not is_trainable: + self.eval() + return load_result + + def set_adapter(self, adapter_name: str) -> None: + """ + Sets the active adapter. + + Only one adapter can be active at a time. + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str`): + The name of the adapter to be set as active. 
The adapter must be loaded first. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} not found.") + self.active_adapter = adapter_name + if not self.peft_config[adapter_name].is_prompt_learning: + self.base_model.set_adapter(adapter_name) + _set_adapter(self, adapter_name) + + @property + def base_model_torch_dtype(self): + return getattr(self.base_model, "dtype", None) + + @property + def active_peft_config(self): + return self.peft_config[self.active_adapter] + + def create_or_update_model_card(self, output_dir: str): + """ + Updates or create model card to include information about peft: + 1. Adds `peft` library tag + 2. Adds peft version + 3. Adds base model info + 4. Adds quantization information if it was used + """ + + filename = os.path.join(output_dir, "README.md") + + card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData()) + + card.data["library_name"] = "peft" + + model_config = getattr(self, "config", None) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + if model_config is not None and "_name_or_path" in model_config: + card.data["base_model"] = model_config["_name_or_path"] + + lines = card.text.splitlines() + + quantization_config = None + if hasattr(model_config, "quantization_config"): + quantization_config = self.config.quantization_config.to_dict() + training_config_text = "" + quantization_prefix = "The following `bitsandbytes` quantization config was used during training:" + # Adds quantization information if it was used + if quantization_config is not None: + training_config_text += f"\n{quantization_prefix}\n" + training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()]) + training_config_text += "\n" + + training_procedure_heading = "## Training procedure" + if quantization_prefix not in lines and bool(training_config_text): + if training_procedure_heading in lines: + lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) + else: + lines.append(f"{training_procedure_heading}\n{training_config_text}") + + # Adds peft version + framework_block_heading = "### Framework versions" + if f"- PEFT {__version__}" not in lines: + if framework_block_heading in lines: + lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}") + else: + lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}") + + card.text = "\n".join(lines) + card.save(filename) + + +class PeftModelForSequenceClassification(PeftModel): + """ + Peft model for sequence classification tasks. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. + + Example: + + ```py + >>> from transformers import AutoModelForSequenceClassification + >>> from peft import PeftModelForSequenceClassification, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "SEQ_CLS", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 768, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 12, + ... "num_layers": 12, + ... "encoder_hidden_size": 768, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForSequenceClassification(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + if self.modules_to_save is None: + self.modules_to_save = {"classifier", "score"} + else: + self.modules_to_save.update({"classifier", "score"}) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable + _set_trainable(self, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(labels=labels, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + pooled_output = outputs[1] if len(outputs) > 1 else outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + pooled_output = self.base_model.dropout(pooled_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.base_model.num_labels == 1: + self.config.problem_type = "regression" + elif self.base_model.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.base_model.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.base_model.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + 
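The task-specific subclasses in this file each carry a docstring example for constructing the model; for the base-class methods defined earlier (`save_pretrained`, `from_pretrained`, `load_adapter`, `set_adapter`, `disable_adapter`), a minimal end-to-end sketch of the adapter lifecycle is given here. The base model name, adapter paths, and adapter names are placeholder values, not part of the MoRA examples.

``` python
# Sketch of the adapter lifecycle on top of the PeftModel API defined above.
# "facebook/opt-125m", the local paths and the adapter names are placeholders.
import torch
from transformers import AutoModelForCausalLM
from peft_mora import LoraConfig, PeftModel, TaskType, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, target_modules=["q_proj", "v_proj"])

peft_model = get_peft_model(base, config)   # wraps the base model in PeftModelForCausalLM
peft_model.print_trainable_parameters()

# ... training loop ...

peft_model.save_pretrained("./adapter-a")   # writes adapter weights + adapter_config.json only

# Reload onto a fresh base model, attach a second adapter and switch between them.
base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
peft_model = PeftModel.from_pretrained(base, "./adapter-a", adapter_name="a")
peft_model.load_adapter("./adapter-b", adapter_name="b")
peft_model.set_adapter("b")                 # only one adapter is active at a time

with peft_model.disable_adapter():          # run the unmodified base model
    _ = peft_model(input_ids=torch.tensor([[1, 2, 3]]))
```

Note that `save_pretrained` stores only the adapter, so the base model has to be re-instantiated separately before `from_pretrained` is called, as its signature above requires.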
+class PeftModelForCausalLM(PeftModel): + """ + Peft model for causal language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModelForCausalLM, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "CAUSAL_LM", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 1280, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 20, + ... "num_layers": 36, + ... "encoder_hidden_size": 1280, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large") + >>> peft_model = PeftModelForCausalLM(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if self.base_model.config.model_type == "mpt": + if inputs_embeds is not None: + raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size) + return self.base_model( + input_ids=input_ids, inputs_embeds=inputs_embeds, past_key_values=past_key_values, **kwargs + ) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + # concat prompt labels + if labels is not None: + prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device) + kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def generate(self, *args, **kwargs): + self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation + if hasattr(self.base_model, "model"): + self.base_model.model.generation_config = self.generation_config + else: + self.base_model.generation_config = self.generation_config + try: + outputs = self.base_model.generate(*args, **kwargs) + except: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + raise + else: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + return outputs + + def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor] = None, **kwargs): + peft_config = self.active_peft_config + model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) + + # https://github.com/huggingface/transformers/pull/26681/ introduced new cache format + # for some architectures which requires a special fix for prompt tuning etc. + # TODO: starting with transformers 4.38, all architectures should support caching. + uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0") + uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0") + transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"] + uses_cache = uses_transformers_4_38 or ( + uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs + ) + + if peft_config.peft_type == PeftType.POLY: + model_kwargs["task_ids"] = task_ids + if peft_config.is_prompt_learning: + if uses_cache and (model_kwargs["past_key_values"] is not None): + # change in the logic of `prepare_inputs_for_generation` makes the below code necessary + # In prompt learning methods, past key values are longer when compared to the `input_ids`. + # As such only consider the last input ids in the autogressive generation phase. 
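+                # (when the cached key/value length already covers the current `input_ids`, keep only the newest token id)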
+ if model_kwargs["past_key_values"][0][0].shape[-2] >= model_kwargs["input_ids"].shape[1]: + model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] + + if model_kwargs.get("attention_mask", None) is not None: + size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens + prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device) + model_kwargs["attention_mask"] = torch.cat( + (prefix_attention_mask, model_kwargs["attention_mask"]), dim=1 + ) + + if model_kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + model_kwargs["position_ids"] = None + + if kwargs.get("token_type_ids", None) is not None: + warnings.warn( + "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" + ) + kwargs["token_type_ids"] = None + + if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) + model_kwargs["past_key_values"] = past_key_values + else: + if model_kwargs["past_key_values"] is None: + inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) + prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1) + model_kwargs["input_ids"] = None + + # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is + # passed in the forward pass to keep track of the position ids of the cache. We have to + # pop that from `model_kwargs` as `cache_position` is properly created by the model, using the passed + # `inputs_embeds`: https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956 + _ = model_kwargs.pop("cache_position", None) + + return model_kwargs + + +class PeftModelForSeq2SeqLM(PeftModel): + """ + Peft model for sequence-to-sequence language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import PeftModelForSeq2SeqLM, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "SEQ_2_SEQ_LM", + ... "inference_mode": False, + ... "r": 8, + ... "target_modules": ["q", "v"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.1, + ... "fan_in_fan_out": False, + ... "enable_lora": None, + ... "bias": "none", + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> peft_model = PeftModelForSeq2SeqLM(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 884736 || all params: 223843584 || trainable%: 0.3952474242013566 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + self.base_model_prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model._prepare_encoder_decoder_kwargs_for_generation + ) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_inputs_embeds=decoder_inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if decoder_attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + decoder_attention_mask.device + ) + if peft_config.peft_type not in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: + decoder_attention_mask = torch.cat((prefix_attention_mask, decoder_attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size) + return self.base_model( + input_ids=input_ids, + decoder_input_ids=decoder_input_ids, + decoder_inputs_embeds=decoder_inputs_embeds, + past_key_values=past_key_values, + **kwargs, + ) + elif peft_config.peft_type in [PeftType.PROMPT_TUNING, PeftType.P_TUNING]: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + attention_mask.device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + + return self.base_model( + inputs_embeds=inputs_embeds, + decoder_input_ids=decoder_input_ids, + decoder_inputs_embeds=decoder_inputs_embeds, + **kwargs, + ) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if decoder_inputs_embeds is None and decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + decoder_inputs_embeds = self.word_embeddings(decoder_input_ids) + + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + attention_mask.device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, attention_mask), dim=1) + # concat prompt labels + if labels is not None: + if peft_config.num_transformer_submodules == 1: + kwargs["labels"] = labels + elif peft_config.num_transformer_submodules == 2: + prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device) + kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + if peft_config.num_transformer_submodules == 1: + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + elif peft_config.num_transformer_submodules == 2: + decoder_inputs_embeds = torch.cat( + (prompts[:, peft_config.num_virtual_tokens :], decoder_inputs_embeds), dim=1 + ) + return self.base_model( + inputs_embeds=inputs_embeds, decoder_inputs_embeds=decoder_inputs_embeds, **kwargs + ) + + def generate(self, **kwargs): + peft_config = self.active_peft_config + self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation + self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( + self._prepare_encoder_decoder_kwargs_for_generation + ) + try: + if not peft_config.is_prompt_learning: + outputs = self.base_model.generate(**kwargs) + else: + if "input_ids" not in kwargs: + raise ValueError("input_ids must be provided for Peft model generation") + if kwargs.get("position_ids", None) is not None: + warnings.warn( + "Position ids are not supported for 
parameter efficient tuning. Ignoring position ids." + ) + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn( + "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" + ) + kwargs["token_type_ids"] = None + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + outputs = self.base_model.generate(**kwargs) + elif peft_config.peft_type in [ + PeftType.PROMPT_TUNING, + PeftType.P_TUNING, + PeftType.MULTITASK_PROMPT_TUNING, + ]: + kwargs = deepcopy(kwargs) + + if "encoder_outputs" in kwargs: + del kwargs["encoder_outputs"] + warnings.warn( + "`encoder_outputs` should not be passed to `generate` when using prompt tuning. Ignoring it." + ) + + input_ids = kwargs.pop("input_ids") + inputs_embeds = self.word_embeddings(input_ids) + batch_size = inputs_embeds.shape[0] + prompts = self.get_prompt(batch_size=batch_size, task_ids=kwargs.pop("task_ids", None)) + prompts = prompts.to(inputs_embeds.dtype) + + inputs_embeds = torch.cat((prompts[:, : peft_config.num_virtual_tokens], inputs_embeds), dim=1) + kwargs["inputs_embeds"] = inputs_embeds + + if "attention_mask" in kwargs: + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to( + kwargs["attention_mask"].device + ) + kwargs["attention_mask"] = torch.cat((prefix_attention_mask, kwargs["attention_mask"]), dim=1) + + return self.base_model.generate(**kwargs) + else: + raise NotImplementedError + except: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model_prepare_encoder_decoder_kwargs_for_generation + ) + raise + else: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + self.base_model._prepare_encoder_decoder_kwargs_for_generation = ( + self.base_model_prepare_encoder_decoder_kwargs_for_generation + ) + return outputs + + def prepare_inputs_for_generation(self, *args, task_ids: torch.Tensor = None, **kwargs): + peft_config = self.active_peft_config + model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) + if peft_config.peft_type == PeftType.POLY: + model_kwargs["task_ids"] = task_ids + if model_kwargs["past_key_values"] is None and peft_config.peft_type == PeftType.PREFIX_TUNING: + batch_size = model_kwargs["decoder_input_ids"].shape[0] + past_key_values = self.get_prompt(batch_size) + model_kwargs["past_key_values"] = past_key_values + + return model_kwargs + + +class PeftModelForTokenClassification(PeftModel): + """ + Peft model for token classification tasks. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. + + Example: + + ```py + >>> from transformers import AutoModelForSequenceClassification + >>> from peft import PeftModelForTokenClassification, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "TOKEN_CLS", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 768, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 12, + ... "num_layers": 12, + ... "encoder_hidden_size": 768, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... 
} + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForTokenClassification.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForTokenClassification(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 370178 || all params: 108680450 || trainable%: 0.3406113979101117 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig = None, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + if self.modules_to_save is None: + self.modules_to_save = {"classifier", "score"} + else: + self.modules_to_save.update({"classifier", "score"}) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable + _set_trainable(self, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(labels=labels, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + sequence_output = outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + sequence_output = self.base_model.dropout(sequence_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForQuestionAnswering(PeftModel): + """ + Peft model for extractive question answering. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + - **cls_layer_name** (`str`) -- The name of the classification layer. + + Example: + + ```py + >>> from transformers import AutoModelForQuestionAnswering + >>> from peft import PeftModelForQuestionAnswering, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "QUESTION_ANS", + ... "inference_mode": False, + ... "r": 16, + ... "target_modules": ["query", "value"], + ... "lora_alpha": 32, + ... 
"lora_dropout": 0.05, + ... "fan_in_fan_out": False, + ... "bias": "none", + ... } + + >>> peft_config = get_peft_config(config) + >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForQuestionAnswering(model, peft_config) + >>> peft_model.print_trainable_parameters() + trainable params: 592900 || all params: 108312580 || trainable%: 0.5473971721475013 + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default") -> None: + super().__init__(model, peft_config, adapter_name) + if self.modules_to_save is None: + self.modules_to_save = {"qa_outputs"} + else: + self.modules_to_save.update({"qa_outputs"}) + + for name, _ in self.base_model.named_children(): + if any(module_name in name for module_name in self.modules_to_save): + self.cls_layer_name = name + break + + # to make sure classifier layer is trainable + _set_trainable(self, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + start_positions=start_positions, + end_positions=end_positions, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. 
Ignoring position ids.") + kwargs["position_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "start_positions": start_positions, + "end_positions": end_positions, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + return self._prefix_tuning_forward(input_ids=input_ids, **kwargs) + else: + if kwargs.get("token_type_ids", None) is not None: + kwargs["token_type_ids"] = torch.cat( + ( + torch.zeros(batch_size, peft_config.num_virtual_tokens).to(self.word_embeddings.weight.device), + kwargs["token_type_ids"], + ), + dim=1, + ).long() + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + def _prefix_tuning_forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + batch_size = _get_batch_size(input_ids, inputs_embeds) + past_key_values = self.get_prompt(batch_size) + fwd_params = list(inspect.signature(self.base_model.forward).parameters.keys()) + kwargs.update( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "inputs_embeds": inputs_embeds, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + "past_key_values": past_key_values, + } + ) + if "past_key_values" in fwd_params: + return self.base_model(start_positions=start_positions, end_positions=end_positions, **kwargs) + else: + transformer_backbone_name = self.base_model.get_submodule(self.transformer_backbone_name) + fwd_params = list(inspect.signature(transformer_backbone_name.forward).parameters.keys()) + if "past_key_values" not in fwd_params: + raise ValueError("Model does not support past key values which are required for prefix tuning.") + outputs = transformer_backbone_name(**kwargs) + sequence_output = outputs[0] + if "dropout" in [name for name, _ in list(self.base_model.named_children())]: + sequence_output = self.base_model.dropout(sequence_output) + logits = self.base_model.get_submodule(self.cls_layer_name)(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return 
QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class PeftModelForFeatureExtraction(PeftModel): + """ + Peft model for extracting features/embeddings from transformer models + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + + **Attributes**: + - **config** ([`~transformers.PretrainedConfig`]) -- The configuration object of the base model. + + Example: + + ```py + >>> from transformers import AutoModel + >>> from peft import PeftModelForFeatureExtraction, get_peft_config + + >>> config = { + ... "peft_type": "LORA", + ... "task_type": "FEATURE_EXTRACTION", + ... "inference_mode": False, + ... "r": 16, + ... "target_modules": ["query", "value"], + ... "lora_alpha": 32, + ... "lora_dropout": 0.05, + ... "fan_in_fan_out": False, + ... "bias": "none", + ... } + >>> peft_config = get_peft_config(config) + >>> model = AutoModel.from_pretrained("bert-base-cased") + >>> peft_model = PeftModelForFeatureExtraction(model, peft_config) + >>> peft_model.print_trainable_parameters() + ``` + """ + + def __init__(self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default"): + super().__init__(model, peft_config, adapter_name) + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + task_ids=None, + **kwargs, + ): + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + if peft_config.peft_type == PeftType.POLY: + kwargs["task_ids"] = task_ids + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + past_key_values = self.get_prompt(batch_size) + return self.base_model(input_ids=input_ids, past_key_values=past_key_values, **kwargs) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + prompts = self.get_prompt(batch_size=batch_size) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) diff --git a/MoRA/peft_mora/py.typed b/MoRA/peft_mora/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MoRA/peft_mora/tuners/__init__.py b/MoRA/peft_mora/tuners/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b47baa668177ec80b3ec142f1555c5b90f13dcca --- /dev/null +++ b/MoRA/peft_mora/tuners/__init__.py @@ -0,0 +1,32 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel +from .lora import LoraConfig, LoraModel, LoftQConfig +from .loha import LoHaConfig, LoHaModel +from .lokr import LoKrConfig, LoKrModel +from .ia3 import IA3Config, IA3Model +from .adalora import AdaLoraConfig, AdaLoraModel +from .p_tuning import PromptEncoder, PromptEncoderConfig, PromptEncoderReparameterizationType +from .prefix_tuning import PrefixEncoder, PrefixTuningConfig +from .prompt_tuning import PromptEmbedding, PromptTuningConfig, PromptTuningInit +from .multitask_prompt_tuning import MultitaskPromptEmbedding, MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .oft import OFTConfig, OFTModel +from .mixed import MixedModel +from .poly import PolyConfig, PolyModel diff --git a/MoRA/peft_mora/tuners/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29d5d895e7fa6671b6cd6268273a65d5b2e69f6a Binary files /dev/null and b/MoRA/peft_mora/tuners/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/__pycache__/lycoris_utils.cpython-312.pyc b/MoRA/peft_mora/tuners/__pycache__/lycoris_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8fceaf8c5fbd020a8a698de4100be119633135e6 Binary files /dev/null and b/MoRA/peft_mora/tuners/__pycache__/lycoris_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/__pycache__/tuners_utils.cpython-312.pyc b/MoRA/peft_mora/tuners/__pycache__/tuners_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16c1d89adf0b2513d2fd824ac4eefef8d4a185f7 Binary files /dev/null and b/MoRA/peft_mora/tuners/__pycache__/tuners_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__init__.py b/MoRA/peft_mora/tuners/adalora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf041fb128477e5c1906e4f82b37b3390e4b46b2 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
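+# NOTE: the bitsandbytes-backed layers (SVDLinear8bitLt and SVDLinear4bit) are intentionally
+# not imported at module load time; they are resolved lazily through the module-level
+# __getattr__ defined below, so this package can be imported without bitsandbytes installed.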
+ +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .config import AdaLoraConfig +from .gptq import SVDQuantLinear +from .layer import AdaLoraLayer, RankAllocator, SVDLinear +from .model import AdaLoraModel + + +__all__ = ["AdaLoraConfig", "AdaLoraLayer", "AdaLoraModel", "SVDLinear", "RankAllocator", "SVDQuantLinear"] + + +def __getattr__(name): + if (name == "SVDLinear8bitLt") and is_bnb_available(): + from .bnb import SVDLinear8bitLt + + return SVDLinear8bitLt + + if (name == "SVDLinear4bit") and is_bnb_4bit_available(): + from .bnb import SVDLinear4bit + + return SVDLinear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5372403ad2dd6ce319a9c592f59a67242c13d708 Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b419d8b5c67544c7dececa1af90d9d1180cad14 Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/gptq.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/gptq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6080380ad77ad89feaf6a09aa462e511fe81cded Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/gptq.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed4c21ce63102e8836e7032b378704dffcc6b22d Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/adalora/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37aa099ab94a7abc2bd1e9d7e32ebf8953afa16f Binary files /dev/null and b/MoRA/peft_mora/tuners/adalora/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adalora/bnb.py b/MoRA/peft_mora/tuners/adalora/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..59ef4ae90863d4d8e708195628e88e3d8659fec2 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/bnb.py @@ -0,0 +1,145 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
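+# bitsandbytes-quantized AdaLoRA layers. Each class is only defined when the corresponding
+# bitsandbytes feature is importable (SVDLinear8bitLt behind is_bnb_available(),
+# SVDLinear4bit behind is_bnb_4bit_available()). Both wrap a quantized base linear layer and
+# add the SVD-parameterized update dropout(x) @ (lora_A * lora_E).T @ lora_B.T, scaled by
+# scaling / ranknum, on top of the quantized forward result.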
+ +from typing import Any + +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .layer import AdaLoraLayer + + +if is_bnb_available(): + + class SVDLinear8bitLt(torch.nn.Module, AdaLoraLayer): + # Low-rank matrix for SVD-based adaptation + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + result = self.base_layer(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + if x.dtype != torch.float32: + x = x.float() + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling / ranknum + # inplace operation on view is forbidden for MatMul8bitLtBackward, so avoid it + result = result + output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep + + +if is_bnb_4bit_available(): + + class SVDLinear4bit(torch.nn.Module, AdaLoraLayer): + # Low-rank matrix for SVD-based adaptation + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + result = self.base_layer(x, *args, **kwargs) + + if self.disable_adapters: + return result + + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. 
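+            # (After the clone, the loop below adds each active adapter's contribution,
+            # computed in the adapter's compute dtype and cast back to the result dtype
+            # when autocast is not enabled.)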
+ result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + compute_dtype = lora_A.dtype + if x.dtype != compute_dtype: + x = x.to(compute_dtype) + + output = dropout(x) @ (lora_A * lora_E).T @ lora_B.T + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling / ranknum + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep diff --git a/MoRA/peft_mora/tuners/adalora/config.py b/MoRA/peft_mora/tuners/adalora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d4595300a8ad41be2fc947580a6bd85459ed3259 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/config.py @@ -0,0 +1,52 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional + +from peft_mora.tuners.lora import LoraConfig +from peft_mora.utils import PeftType + + +@dataclass +class AdaLoraConfig(LoraConfig): + """ + This is the configuration class to store the configuration of a [`~peft.AdaLora`]. + + Args: + target_r (`int`): The target average rank of incremental matrix. + init_r (`int`): The initial rank for each incremental matrix. + tinit (`int`): The steps of initial fine-tuning warmup. + tfinal (`int`): The step of final fine-tuning. + deltaT (`int`): The time internval between two budget allocations. + beta1 (`float`): The hyperparameter of EMA for sensitivity smoothing. + beta2 (`float`): The hyperparameter of EMA for undertainty quantification. + orth_reg_weight (`float`): The coefficient of orthogonal regularization. + total_step (`int`): The total training steps that should be specified before training. + rank_pattern (`list`): The allocated rank for each weight matrix by RankAllocator. 
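+
+    Example (illustrative only; the rank and schedule values below are placeholders, not
+    recommendations, and should be chosen to match your own training run):
+
+    ```py
+    >>> from peft import AdaLoraConfig
+
+    >>> config = AdaLoraConfig(
+    ...     init_r=12,
+    ...     target_r=8,
+    ...     tinit=200,
+    ...     tfinal=500,
+    ...     deltaT=10,
+    ...     total_step=3000,
+    ...     lora_alpha=32,
+    ...     lora_dropout=0.01,
+    ...     target_modules=["q", "v"],
+    ...     task_type="SEQ_2_SEQ_LM",
+    ... )
+    ```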
+ """ + + target_r: int = field(default=8, metadata={"help": "Target Lora matrix dimension."}) + init_r: int = field(default=12, metadata={"help": "Initial Lora matrix dimension."}) + tinit: int = field(default=0, metadata={"help": "The steps of initial warmup."}) + tfinal: int = field(default=0, metadata={"help": "The steps of final warmup."}) + deltaT: int = field(default=1, metadata={"help": "Step interval of rank allocation."}) + beta1: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."}) + beta2: float = field(default=0.85, metadata={"help": "Hyperparameter of EMA."}) + orth_reg_weight: float = field(default=0.5, metadata={"help": "The orthogonal regularization coefficient."}) + total_step: Optional[int] = field(default=None, metadata={"help": "The total training steps."}) + rank_pattern: Optional[dict] = field(default=None, metadata={"help": "The saved rank pattern."}) + + def __post_init__(self): + self.peft_type = PeftType.ADALORA diff --git a/MoRA/peft_mora/tuners/adalora/gptq.py b/MoRA/peft_mora/tuners/adalora/gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..910377c5db5908727ed4753fd15b24e68821ce00 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/gptq.py @@ -0,0 +1,72 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +from .layer import AdaLoraLayer + + +class SVDQuantLinear(torch.nn.Module, AdaLoraLayer): + def __init__( + self, + base_layer, + adapter_name, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + result = self.quant_linear_module(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + if x.dtype != torch.float32: + x = x.float() + + output = (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum + # TODO: here, the dtype conversion is applied on the *whole expression*, + # not the intermediate result, unlike for SVDLinear8bitLT and + # SVDLinear4bit, is that correct? 
+ if requires_conversion: + output = output.to(expected_dtype) + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep diff --git a/MoRA/peft_mora/tuners/adalora/layer.py b/MoRA/peft_mora/tuners/adalora/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..fc6835e9fbf67dae4fb751d1968b0a99387323bc --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/layer.py @@ -0,0 +1,346 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Any, List, Optional + +import torch +from torch import nn + +from peft_mora.tuners.lora import LoraLayer +from peft_mora.tuners.tuners_utils import check_adapters_to_merge +from peft_mora.utils import transpose + + +class AdaLoraLayer(LoraLayer): + # List all names of layers that may contain adapter weights + # Note: ranknum doesn't need to be included as it is not an nn.Module + adapter_layer_names = ("lora_A", "lora_B", "lora_E", "lora_embedding_A", "lora_embedding_B") + # other_param_names is defined in LoraLayer + + def __init__(self, base_layer: nn.Module) -> None: + super().__init__(base_layer) + self.lora_E = nn.ParameterDict({}) + self.lora_A = nn.ParameterDict({}) + self.lora_B = nn.ParameterDict({}) + self.ranknum = nn.ParameterDict({}) + + def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + # Right singular vectors + self.lora_A[adapter_name] = nn.Parameter(torch.randn(r, self.in_features)) + # Singular values + self.lora_E[adapter_name] = nn.Parameter(torch.randn(r, 1)) + # Left singular vectors + self.lora_B[adapter_name] = nn.Parameter(torch.randn(self.out_features, r)) + # The current rank + self.ranknum[adapter_name] = nn.Parameter(torch.randn(1), requires_grad=False) + self.ranknum[adapter_name].data.fill_(float(r)) + self.ranknum[adapter_name].requires_grad = False + self.scaling[adapter_name] = lora_alpha if lora_alpha > 0 else float(r) + if init_lora_weights: + self.reset_lora_parameters(adapter_name) + + if hasattr(self.get_base_layer(), "qweight"): + # QuantLinear + self.to(self.get_base_layer().qweight.device) + else: + self.to(self.get_base_layer().weight.device) + self.set_adapter(self.active_adapters) + + def reset_lora_parameters(self, adapter_name): + if adapter_name in self.lora_A.keys(): + nn.init.normal_(self.lora_E[adapter_name], mean=0.0, std=0.02) + nn.init.normal_(self.lora_A[adapter_name], mean=0.0, std=0.02) + nn.init.normal_(self.lora_B[adapter_name], mean=0.0, std=0.02) + + +class SVDLinear(nn.Module, AdaLoraLayer): + # SVD-based adaptation by a dense 
layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_lora_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + AdaLoraLayer.__init__(self, base_layer) + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + base_layer = self.get_base_layer() + if active_adapter in self.lora_A.keys(): + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + return ( + transpose(self.lora_B[adapter] @ (self.lora_A[adapter] * self.lora_E[adapter]), self.fan_in_fan_out) + * self.scaling[adapter] + / (self.ranknum[adapter] + 1e-5) + ) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + lora_E = self.lora_E[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + ranknum = self.ranknum[active_adapter] + 1e-5 + + x = x.to(lora_A.dtype) + result += (dropout(x) @ (lora_A * lora_E).T @ lora_B.T) * scaling / ranknum + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "adalora." + rep + + +class RankAllocator: + """ + The RankAllocator for AdaLoraModel. 
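+    During training it moves rank budget between SVD triplets (each row of lora_E together with
+    the matching row of lora_A and column of lora_B): per-parameter importance is smoothed with
+    EMAs controlled by beta1/beta2, the total budget follows a cubic schedule from the initial
+    budget down to an average of target_r per targeted module, and the least important lora_E
+    entries are masked to zero.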
Paper: https://openreview.net/pdf?id=lq62uWRJjiY + + Args: + config ([`AdaLoraConfig`]): The configuration of the AdaLora model. + model: the model that we apply AdaLoRA to. + + """ + + def __init__(self, model, peft_config, adapter_name): + self.peft_config = peft_config + self.adapter_name = adapter_name + self.beta1 = peft_config.beta1 + self.beta2 = peft_config.beta2 + assert self.beta1 > 0 and self.beta1 < 1 + assert self.beta2 > 0 and self.beta2 < 1 + + self.reset_ipt() + self._set_budget_scheduler(model) + + def set_total_step(self, total_step): + self.peft_config.total_step = total_step + + def reset_ipt(self): + self.ipt = {} + self.exp_avg_ipt = {} + self.exp_avg_unc = {} + + def _set_budget_scheduler(self, model): + self.init_bgt = 0 + self.name_set = set() + for n, p in model.named_parameters(): + if f"lora_A.{self.adapter_name}" in n: + self.init_bgt += p.size(0) + self.name_set.add(n.replace("lora_A", "%s")) + self.name_set = sorted(self.name_set) + # The total final rank budget + self.target_bgt = self.peft_config.target_r * len(self.name_set) + + def budget_schedule(self, step: int): + tinit = self.peft_config.tinit + tfinal = self.peft_config.tfinal + total_step = self.peft_config.total_step + # Initial warmup + if step <= tinit: + budget = self.init_bgt + mask_ind = False + # Final fine-tuning + elif step > total_step - tfinal: + budget = self.target_bgt + mask_ind = True + else: + # Budget decreasing with a cubic scheduler + mul_coeff = 1 - (step - tinit) / (total_step - tfinal - tinit) + budget = int((self.init_bgt - self.target_bgt) * (mul_coeff**3) + self.target_bgt) + mask_ind = True if step % self.peft_config.deltaT == 0 else False + return budget, mask_ind + + def update_ipt(self, model): + # Update the sensitivity and uncertainty for every weight + for n, p in model.named_parameters(): + if "lora_" in n and self.adapter_name in n: + if n not in self.ipt: + self.ipt[n] = torch.zeros_like(p) + self.exp_avg_ipt[n] = torch.zeros_like(p) + self.exp_avg_unc[n] = torch.zeros_like(p) + with torch.no_grad(): + self.ipt[n] = (p * p.grad).abs().detach() + # Sensitivity smoothing + self.exp_avg_ipt[n] = self.beta1 * self.exp_avg_ipt[n] + (1 - self.beta1) * self.ipt[n] + # Uncertainty quantification + self.exp_avg_unc[n] = ( + self.beta2 * self.exp_avg_unc[n] + (1 - self.beta2) * (self.ipt[n] - self.exp_avg_ipt[n]).abs() + ) + + def _element_score(self, n): + return self.exp_avg_ipt[n] * self.exp_avg_unc[n] + + def _combine_ipt(self, ipt_E, ipt_AB): + ipt_AB = ipt_AB.sum(dim=1, keepdim=False) + sum_ipt = ipt_E.view(-1) + ipt_AB.view(-1) + return sum_ipt + + def mask_to_budget(self, model, budget): + value_ipt = {} + vector_ipt = {} + triplet_ipt = {} + # Get the importance score for A, E, B + for n, p in model.named_parameters(): + if f"lora_A.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + comb_ipt = torch.mean(entry_ipt, dim=1, keepdim=True) + name_m = n.replace("lora_A", "%s") + if name_m not in vector_ipt: + vector_ipt[name_m] = [comb_ipt] + else: + vector_ipt[name_m].append(comb_ipt) + if f"lora_B.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + comb_ipt = torch.mean(entry_ipt, dim=0, keepdim=False).view(-1, 1) + name_m = n.replace("lora_B", "%s") + if name_m not in vector_ipt: + vector_ipt[name_m] = [comb_ipt] + else: + vector_ipt[name_m].append(comb_ipt) + if f"lora_E.{self.adapter_name}" in n: + entry_ipt = self._element_score(n) + name_m = n.replace("lora_E", "%s") + value_ipt[name_m] = entry_ipt + + all_score = [] + # Calculate the 
score for each triplet + for name_m in vector_ipt: + ipt_E = value_ipt[name_m] + ipt_AB = torch.cat(vector_ipt[name_m], dim=1) + sum_ipt = self._combine_ipt(ipt_E, ipt_AB) + name_E = name_m % "lora_E" + triplet_ipt[name_E] = sum_ipt.view(-1, 1) + all_score.append(sum_ipt.view(-1)) + + # Get the threshold by ranking ipt + mask_threshold = torch.kthvalue( + torch.cat(all_score), + k=self.init_bgt - budget, + )[0].item() + + rank_pattern = {} + # Mask the unimportant triplets + with torch.no_grad(): + for n, p in model.named_parameters(): + if f"lora_E.{self.adapter_name}" in n: + p.masked_fill_(triplet_ipt[n] <= mask_threshold, 0.0) + rank_pattern[n] = (~(triplet_ipt[n] <= mask_threshold)).view(-1).tolist() + return rank_pattern + + def update_and_allocate(self, model, global_step, force_mask=False): + # # Update the importance score and allocate the budget + if global_step < self.peft_config.total_step - self.peft_config.tfinal: + self.update_ipt(model) + budget, mask_ind = self.budget_schedule(global_step) + # Allocate the budget according to importance scores + if mask_ind or force_mask: + rank_pattern = self.mask_to_budget(model, budget) + else: + rank_pattern = None + return budget, rank_pattern + + def mask_using_rank_pattern(self, model, rank_pattern): + # Mask the unimportant triplets + is_adapter_name_truncated = False + if self.adapter_name not in next(iter(rank_pattern.keys())): + is_adapter_name_truncated = True + + with torch.no_grad(): + for n, p in model.named_parameters(): + if f"lora_E.{self.adapter_name}" in n: + key = n if not is_adapter_name_truncated else n.replace(f".{self.adapter_name}", "") + mask = torch.Tensor(rank_pattern[key]).unsqueeze(-1).to(p.device) + p.masked_fill_(~mask.bool(), 0.0) diff --git a/MoRA/peft_mora/tuners/adalora/model.py b/MoRA/peft_mora/tuners/adalora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..7b9abc3e2152deff562d283356d3f8e89df250f4 --- /dev/null +++ b/MoRA/peft_mora/tuners/adalora/model.py @@ -0,0 +1,346 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import torch +from transformers.pytorch_utils import Conv1D + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.lora import LoraConfig, LoraModel +from peft_mora.tuners.tuners_utils import BaseTunerLayer +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + _freeze_adapter, + _get_submodules, + get_auto_gptq_quant_linear, + get_quantization_config, +) + +from .gptq import SVDQuantLinear +from .layer import AdaLoraLayer, RankAllocator, SVDLinear + + +class AdaLoraModel(LoraModel): + """ + Creates AdaLoRA (Adaptive LoRA) model from a pretrained transformers model. Paper: + https://openreview.net/forum?id=lq62uWRJjiY + + Args: + model ([`transformers.PreTrainedModel`]): The model to be adapted. + config ([`AdaLoraConfig`]): The configuration of the AdaLora model. 
+ adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The AdaLora model. + + Example:: + + >>> from transformers import AutoModelForSeq2SeqLM, LoraConfig >>> from peft import AdaLoraModel, AdaLoraConfig + >>> config = AdaLoraConfig( + peft_type="ADALORA", task_type="SEQ_2_SEQ_LM", r=8, lora_alpha=32, target_modules=["q", "v"], + lora_dropout=0.01, + ) + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") >>> model = AdaLoraModel(model, config, "default") + + **Attributes**: + - **model** ([`transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`AdaLoraConfig`]): The configuration of the AdaLora model. + """ + + # Note: don't redefine prefix here, it should be inherited from LoraModel + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + traininable_mode_counter = 0 + for config in self.peft_config.values(): + if not config.inference_mode: + traininable_mode_counter += 1 + + if traininable_mode_counter > 1: + raise ValueError( + "AdaLoraModel supports only 1 trainable adapter. " + "When using multiple adapters, set inference_mode to True for all adapters except the one you want to train." + ) + + if self.peft_config[adapter_name].inference_mode: + _freeze_adapter(self.model, adapter_name) + else: + self.trainable_adapter_name = adapter_name + self.rankallocator = RankAllocator(self.model, self.peft_config[adapter_name], self.trainable_adapter_name) + + def _check_new_adapter_config(self, config: LoraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + super()._check_new_adapter_config(config) + + traininable_mode_counter = 0 + for config_ in self.peft_config.values(): + if not config_.inference_mode: + traininable_mode_counter += 1 + + if traininable_mode_counter > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 trainable adapter. " + "When using multiple adapters, set inference_mode to True for all adapters except the one " + "you want to train." + ) + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + kwargs = { + "r": lora_config.init_r, + "lora_alpha": lora_config.lora_alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + if (kwargs["loaded_in_8bit"] or kwargs["loaded_in_4bit"]) and not is_bnb_available(): + raise ImportError( + "To use AdaLora with 8-bit quantization, please install the `bitsandbytes` package. " + "You can install it with `pip install bitsandbytes`." 
+ ) + + quantization_config = get_quantization_config(self.model, method="gptq") + if quantization_config is not None: + kwargs["gptq_quantization_config"] = quantization_config + + # If it is not an AdaLoraLayer, create a new module, else update it with new adapters + if not isinstance(target, AdaLoraLayer): + new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + else: + target.update_layer( + adapter_name, + lora_config.init_r, + lora_config.lora_alpha, + lora_config.lora_dropout, + lora_config.init_lora_weights, + ) + + @staticmethod + def _create_new_module(lora_config, adapter_name, target, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import SVDLinear8bitLt + if is_bnb_4bit_available(): + from .bnb import SVDLinear4bit + + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "memory_efficient_backward": target_base_layer.state.memory_efficient_backward, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = SVDLinear8bitLt(target, adapter_name, **kwargs) + elif loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = SVDLinear4bit(target, adapter_name, **fourbit_kwargs) + elif AutoGPTQQuantLinear is not None and isinstance(target, AutoGPTQQuantLinear): + new_module = SVDQuantLinear(target, adapter_name, **kwargs) + else: + if isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear` and `Conv1D` are supported." 
+ ) + new_module = SVDLinear(target, adapter_name, **kwargs) + + return new_module + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING[ + model_config["model_type"] + ] + return peft_config + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def forward(self, *args, **kwargs): + outputs = self.model.forward(*args, **kwargs) + + if (getattr(outputs, "loss", None) is not None) and isinstance(outputs.loss, torch.Tensor): + # Calculate the orthogonal regularization + orth_reg_weight = self.peft_config[self.trainable_adapter_name].orth_reg_weight + + if orth_reg_weight <= 0: + raise ValueError("orth_reg_weight should be greater than 0. ") + + regu_loss = 0 + num_param = 0 + for n, p in self.model.named_parameters(): + if ("lora_A" in n or "lora_B" in n) and self.trainable_adapter_name in n: + para_cov = p @ p.T if "lora_A" in n else p.T @ p + I = torch.eye(*para_cov.size(), out=torch.empty_like(para_cov)) # noqa: E741 + I.requires_grad = False + num_param += 1 + regu_loss += torch.norm(para_cov - I, p="fro") + if num_param > 0: + regu_loss = regu_loss / num_param + else: + regu_loss = 0 + outputs.loss += orth_reg_weight * regu_loss + return outputs + + def resize_modules_by_rank_pattern(self, rank_pattern, adapter_name): + lora_config = self.peft_config[adapter_name] + for name, rank_idx in rank_pattern.items(): + if isinstance(rank_idx, list): + rank = sum(rank_idx) + elif isinstance(rank_idx, torch.Tensor): + rank_idx = rank_idx.view(-1) + rank = rank_idx.sum().item() + else: + raise ValueError("Unexpected type of rank_idx") + key = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) + _, target, _ = _get_submodules(self.model, key) + lora_E_weights = target.lora_E[adapter_name][rank_idx] + lora_A_weights = target.lora_A[adapter_name][rank_idx] + lora_B_weights = target.lora_B[adapter_name][:, rank_idx] + ranknum = target.ranknum[adapter_name] + target.update_layer( + adapter_name, + rank, + lora_config.lora_alpha, + lora_config.lora_dropout, + lora_config.init_lora_weights, + ) + with torch.no_grad(): + if rank > 0: + target.lora_E[adapter_name].copy_(lora_E_weights) + target.lora_A[adapter_name].copy_(lora_A_weights) + target.lora_B[adapter_name].copy_(lora_B_weights) + # The scaling is exactly as the previous + target.ranknum[adapter_name].copy_(ranknum) + + def resize_state_dict_by_rank_pattern(self, rank_pattern, state_dict, adapter_name): + for name, rank_idx in rank_pattern.items(): + rank = sum(rank_idx) + prefix = ".".join(name.split(".")[0:-2]) if adapter_name in name else ".".join(name.split(".")[0:-1]) + for layer in ["lora_E", "lora_A", "lora_B"]: + key = f"base_model.model.{prefix}.{layer}.{adapter_name}" + if layer != "lora_B": + state_dict[key] = ( + state_dict[key][rank_idx] if rank != state_dict[key].shape[0] else state_dict[key] + ) + else: + state_dict[key] = ( + state_dict[key][:, rank_idx] if rank != state_dict[key].shape[1] else state_dict[key] + ) + return state_dict + + def update_and_allocate(self, global_step): + """ + This method updates 
Adalora budget and mask. + + This should be called in every training step after `loss.backward()` and before `zero_grad()`. + + `tinit`, `tfinal` and `deltaT` are handled with in the method. + + Args: + global_step (`int`): The current training step, it is used to calculate adalora budget. + + Example: + + ```python + >>> loss = model(**input).loss + >>> loss.backward() + >>> optimizer.step() + >>> model.base_model.update_and_allocate(i_step) + >>> optimizer.zero_grad() + ``` + """ + lora_config = self.peft_config[self.trainable_adapter_name] + # Update the importance score and allocate the budget + if global_step < lora_config.total_step - lora_config.tfinal: + _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step) + if rank_pattern: + lora_config.rank_pattern = rank_pattern + # Finalize the budget allocation + elif global_step == lora_config.total_step - lora_config.tfinal: + _, rank_pattern = self.rankallocator.update_and_allocate(self.model, global_step, force_mask=True) + # for some reason, this freezes the trainable parameters and nothing gets updates + # self.resize_modules_by_rank_pattern(rank_pattern, self.trainable_adapter_name) + lora_config.rank_pattern = rank_pattern + self.rankallocator.reset_ipt() + # Currently using inefficient way to mask the unimportant weights using the rank pattern + # due to problem mentioned above + elif global_step > lora_config.total_step - lora_config.tfinal: + self.rankallocator.mask_using_rank_pattern(self.model, lora_config.rank_pattern) + # Pass the function and do forward propagation + else: + return None diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__init__.py b/MoRA/peft_mora/tuners/adaption_prompt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4ede9455f70e41740768abe80f3198e78397053f --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
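+# Adaption-prompt tuner (LLaMA-Adapter style): AdaptedAttention wraps an existing attention
+# module and adds a learned prompt whose influence is gated by a zero-initialized scalar,
+# while AdaptionPromptModel swaps these wrappers in and out of the model per adapter.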
+from .config import AdaptionPromptConfig +from .layer import AdaptedAttention +from .model import AdaptionPromptModel + + +__all__ = ["AdaptionPromptConfig", "AdaptedAttention", "AdaptionPromptModel"] diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..226e57f11a8915235c8cc10dec920aefed216928 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..751917461c74e90128a773e67ae73e2f2b18de80 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24f787eabc2e23919d0735b5a9f7879b92be0017 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a56e027dc4312fa1e58b3ef41dcbfa33f747cee Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/utils.cpython-312.pyc b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53d7ef6c90c4cb39801d9b3890695821f4d6f407 Binary files /dev/null and b/MoRA/peft_mora/tuners/adaption_prompt/__pycache__/utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/adaption_prompt/config.py b/MoRA/peft_mora/tuners/adaption_prompt/config.py new file mode 100644 index 0000000000000000000000000000000000000000..a6aad4c91be7db00980521f6c30a9bd358da038c --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/config.py @@ -0,0 +1,73 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
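+# Besides AdaptionPromptConfig itself, this module keeps a small per-architecture registry
+# (TRANSFORMERS_MODEL_CONFIG) describing where the k/v/o projections live and how to recompute
+# query states; currently only the "llama" model type is registered, and prepare_config()
+# fills in target_modules from that registry when the user leaves it unset.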
+ +from collections import namedtuple +from dataclasses import dataclass, field + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + +from .utils import llama_compute_query_states + + +@dataclass +class AdaptionPromptConfig(PeftConfig): + """Stores the configuration of an [`AdaptionPromptModel`].""" + + target_modules: str = field( + default=None, metadata={"help": "Name of the attention submodules to insert adaption prompts into."} + ) + adapter_len: int = field(default=None, metadata={"help": "Number of adapter tokens to insert"}) + adapter_layers: int = field(default=None, metadata={"help": "Number of adapter layers (from the top)"}) + + def __post_init__(self): + self.peft_type = PeftType.ADAPTION_PROMPT + + @property + def is_adaption_prompt(self) -> bool: + """Return True if this is an adaption prompt config.""" + return True + + +# Contains the config that is specific to a transformers model type. +ModelTypeConfig = namedtuple( + "ModelTypeConfig", ["compute_query_states", "target_modules", "k_proj_layer", "v_proj_layer", "o_proj_layer"] +) + +# Mapping of transformers model types to their specific configuration. +TRANSFORMERS_MODEL_CONFIG = { + "llama": ModelTypeConfig( + compute_query_states=llama_compute_query_states, + target_modules="self_attn", + k_proj_layer="k_proj", + v_proj_layer="v_proj", + o_proj_layer="o_proj", + ), +} + + +def prepare_config( + peft_config: AdaptionPromptConfig, + model, +) -> AdaptionPromptConfig: + """Prepare the config based on the llama model type.""" + if model.config.model_type not in TRANSFORMERS_MODEL_CONFIG: + raise ValueError("Unsupported model type for adaption prompt: '{model.config.model_type}'.") + + model_config = TRANSFORMERS_MODEL_CONFIG[model.config.model_type] + + if peft_config.target_modules is None: + peft_config.target_modules = model_config.target_modules + + return peft_config diff --git a/MoRA/peft_mora/tuners/adaption_prompt/layer.py b/MoRA/peft_mora/tuners/adaption_prompt/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..cdd7895eaae63c2bd1ec180087395d3baf9769bd --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/layer.py @@ -0,0 +1,120 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .config import TRANSFORMERS_MODEL_CONFIG + + +class AdaptedAttention(nn.Module): + """This module wraps a LLamaAttention module and injects adaption prompts.""" + + def __init__(self, model_type: str, adapter_len: int, model): + """ + Initialize object. + + Args: + model_type: The transformer model type. This is used to retrieve the right method to + compute query states. + adapter_len: The length of the adaption prompt to insert. + model: The original transformer attention module that is being wrapped. 
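+
+            The wrapper keeps `adapter_len` learnable prompt vectors (drawn from a standard
+            normal) plus a gate that is initialized to zero ("zero-init"), so the adapter's
+            contribution to the attention output starts at zero.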
+ """ + assert not isinstance(model, AdaptedAttention) + super().__init__() + self.model_type = model_type + self.model = model + self.adapter_len = adapter_len + # Assume all parameters of the attention model we are wrapping are on the same device. + device = next(model.parameters()).device + # Don't think this was specified in the paper, but we follow the official repo which used an Embedding + # which initializes the tokens with standard normal values. + # https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L234 + # (bsz, adapter_len, hidden_size) + target_dtype = ( + model.q_proj.weight.dtype if model.q_proj.weight.dtype not in [torch.int8, torch.uint8] else torch.float32 + ) + self.adaption_prompt = nn.Parameter( + torch.empty(1, adapter_len, self.model.hidden_size, device=device, dtype=target_dtype).normal_() + ) + # Initialize the gate to 0 as this is "zero-init". + self.adaption_gate = nn.Parameter(torch.zeros(1, device=device, dtype=target_dtype)) + + def forward(self, **kwargs): + """ + Forward pass for the adapter which wraps the original LlamaAttention module. + + "Official" paper implementation: + https://github.com/ZrrSkywalker/LLaMA-Adapter/blob/41c3546fe1997ab8a65809dc8d8f9252b19d9faf/llama/model.py#L141 + + Args: + kwargs: See the original LlamaAttention module. + """ + if kwargs.get("output_attention", False): + raise NotImplementedError("output_attention is not currently supported.") + + output, _, past_key_value = self.model(**kwargs) + bsz = output.shape[0] + q_len = output.shape[1] + embed_dim = output.shape[2] + k_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].k_proj_layer + v_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].v_proj_layer + o_proj_layer = TRANSFORMERS_MODEL_CONFIG[self.model_type].o_proj_layer + + if k_proj_layer == v_proj_layer: + _, key, value = getattr(self.model, k_proj_layer)(self.adaption_prompt).split(embed_dim, dim=2) + else: + key = getattr(self.model, k_proj_layer)(self.adaption_prompt) + value = getattr(self.model, v_proj_layer)(self.adaption_prompt) + # (bsz, num_heads, adapter_len, head_dim) + adapter_k = ( + key.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim) + .repeat(bsz, 1, 1, 1) + .transpose(1, 2) + ) + # (bsz, num_heads, adapter_len, head_dim) + adapter_v = ( + value.view(1, self.adapter_len, self.model.num_heads, self.model.head_dim) + .repeat(bsz, 1, 1, 1) + .transpose(1, 2) + ) + + # Recompute query states. + compute_query_states = TRANSFORMERS_MODEL_CONFIG[self.model_type].compute_query_states + # (bsz, num_heads, q_len, head_dim) + query_states = compute_query_states(model=self.model, **kwargs) + + previous_dtype = query_states.dtype + # (bsz, num_heads, q_len, adapter_len) + scores = torch.matmul(query_states, adapter_k.transpose(2, 3).to(previous_dtype)) / math.sqrt( + self.model.head_dim + ) + # Upcast attention to fp32 + # (bsz, num_heads, q_len, adapter_len) + scores = self.adaption_gate * F.softmax(scores, dim=-1, dtype=torch.float32).to(previous_dtype) + # (bsz, q_len, num_heads * head_dim) + adapter_output = torch.matmul(scores, adapter_v).transpose(1, 2).reshape(bsz, q_len, -1) + # (bsz, q_len, hidden_size) + if o_proj_layer is not None: + adapter_output = getattr(self.model, o_proj_layer)(adapter_output) + + # Add adaption prompt output to original output. + output = output + adapter_output + + # Restore original dtype. 
+ output = output.to(previous_dtype) + return output, None, past_key_value diff --git a/MoRA/peft_mora/tuners/adaption_prompt/model.py b/MoRA/peft_mora/tuners/adaption_prompt/model.py new file mode 100644 index 0000000000000000000000000000000000000000..3b39058be486c1814f5424168db6b04e95ead0fa --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/model.py @@ -0,0 +1,161 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +import torch.nn as nn + +from peft_mora.utils import _freeze_adapter, _get_submodules + +from .config import AdaptionPromptConfig, prepare_config +from .layer import AdaptedAttention +from .utils import is_adaption_prompt_trainable + + +class AdaptionPromptModel(nn.Module): + """ + Implements adaption prompts as described in https://arxiv.org/pdf/2303.16199.pdf. + + The top L attention modules are replaced with AdaptedAttention modules that wrap the original ones, but insert + trainable prompts with gates (for zero init). + + Notes on the multi-adapter pattern: + - We store the states of different adapters by keeping a dictionary of AdaptedAttention modules indexed by adapter + name. + - Every time we switch adapters, we remove the modules of the currently active adapter from the model, store them + in the dictionary, and replace them with the modules of the new adapter. + - To avoid duplicated and potentially inconsistent state, the currently active adapter is always removed from the + dictionary. + - Disabling the adapter would also result in the modules being removed from the model. + """ + + def __init__(self, model, configs: Dict, adapter_name: str): + super().__init__() + self.model = model + # Store adapter configs by name. + self.peft_config: Dict[str, AdaptionPromptConfig] = {} + # Store lists of the parents of the affected attention modules by adapter name. + # We keep references to the parents so we can swap the adapters in-and-out of the model. + self._parents: Dict[str, List[nn.Module]] = {} + # Store lists of cached AdaptedAttention modules by name. + self._cached_adapters: Dict[str, List] = {} + # The name of the currently active adapter. + self._active_adapter = None + # Whether the adapter is enabled. 
+ self._enabled = True + self.forward = self.model.forward + self.add_adapter(adapter_name, configs[adapter_name]) + self._mark_only_adaption_prompts_as_trainable(self.model) + + def add_adapter(self, adapter_name: str, config: AdaptionPromptConfig) -> None: + """Add an adapter with the given name and config.""" + config = prepare_config(config, self.model) + if adapter_name in self.peft_config: + raise ValueError(f"Adapter with name '{adapter_name}' already exists.") + + parents = [] + for name, _ in self.model.named_modules(): + if name.endswith(config.target_modules): + par, _, _ = _get_submodules(self.model, name) + parents.append(par) + if len(parents) < config.adapter_layers: + raise ValueError( + f"Config specifies more adapter layers '{config.adapter_layers}'" + f" than the model has '{len(parents)}'." + ) + # Note that if the target modules are not in Sequential, ModuleList, or + # some other PyTorch ordered container, the behavior is undefined as we + # assume here that the order of the modules is the same as the order of + # the transformer decoder layers. + parents = parents[-config.adapter_layers :] + self._parents[adapter_name] = parents + + # It is only None during initialization. + # If it is disabled, we don't have to remove the modules. + if self._active_adapter is not None and self._enabled: + self._remove_adapted_attentions(self._active_adapter) + self._active_adapter = adapter_name + self.peft_config[adapter_name] = config + self._create_adapted_attentions(config, parents) + if not self._enabled: + self._remove_adapted_attentions(self._active_adapter) + + if config.inference_mode: + _freeze_adapter(self.model, adapter_name) + + def set_adapter(self, adapter_name: str) -> None: + """Set the model to use the adapter with the given name.""" + if self._active_adapter == adapter_name: + return + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter with name '{adapter_name}' does not exist.") + + if self._enabled: + self._remove_adapted_attentions(self._active_adapter) + self._set_adapted_attentions(adapter_name) + + self._active_adapter = adapter_name + + def enable_adapter_layers(self): + """Enable adapter layers by swapping in cached AdaptedAttention modules.""" + self._enabled = True + self._set_adapted_attentions(self._active_adapter) + + def disable_adapter_layers(self): + """Disable adapter layers by swapping out AdaptedAttention modules.""" + self._enabled = False + self._remove_adapted_attentions(self._active_adapter) + + def _create_adapted_attentions(self, config: AdaptionPromptConfig, parents: List[nn.Module]) -> None: + """Wrap LlamaAttention modules with newly created AdaptedAttention modules.""" + for par in parents: + attn = AdaptedAttention( + model_type=self.model.config.model_type, + adapter_len=config.adapter_len, + model=getattr(par, config.target_modules), + ) + setattr(par, config.target_modules, attn) + + def _set_adapted_attentions(self, adapter_name: str) -> None: + """Replace LlamaAttention modules with cached AdaptedAttention modules.""" + cached = self._cached_adapters[adapter_name] + del self._cached_adapters[adapter_name] + config = self.peft_config[adapter_name] + for i, par in enumerate(self._parents[adapter_name]): + setattr(par, config.target_modules, cached[i]) + + def _remove_adapted_attentions(self, adapter_name: str) -> None: + """Remove AdaptedAttention modules from the model and store them in the cache.""" + config = self.peft_config[adapter_name] + adapted_attentions = [] + for par in self._parents[adapter_name]: + 
attn = getattr(par, config.target_modules) + adapted_attentions.append(attn) + setattr(par, config.target_modules, attn.model) + self._cached_adapters[adapter_name] = adapted_attentions + + def _mark_only_adaption_prompts_as_trainable(self, model: nn.Module) -> None: + """Freeze all parameters of the model except the adaption prompts.""" + for n, p in model.named_parameters(): + if not is_adaption_prompt_trainable(n): + p.requires_grad = False + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + # This is necessary as e.g. causal models have various methods that we + # don't want to re-implement here. + return getattr(self.model, name) diff --git a/MoRA/peft_mora/tuners/adaption_prompt/utils.py b/MoRA/peft_mora/tuners/adaption_prompt/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..24f33e5584009a04f7d73f5f675d3395f09da215 --- /dev/null +++ b/MoRA/peft_mora/tuners/adaption_prompt/utils.py @@ -0,0 +1,111 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect + +import torch +import torch.nn as nn + + +def llama_rotate_half(x: torch.Tensor) -> torch.Tensor: + """ + Rotate half the hidden dims of the input. + + This function was duplicated verbatim from: + https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L126 + + This was done to eliminate the Llama transformers implementation as a dependency of this file. Note that some other + functions were also adapted from the transformers implementation but were modified. + """ + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def llama_apply_rotary_pos_emb(q, cos, sin, position_ids): + """ + Apply rotary position embedding to query states in the Llama model. + + This function was adapted from: + https://github.com/huggingface/transformers/blob/1de8ce9ee1191ba761a593ac15d9ccbf5851bfc5/src/transformers/models/llama/modeling_llama.py#L133 + + It was modified to remove unnecessary processing of key states. The method is compatible with transformers <= + 4.34.2 and also with the latest version (>=4.35). 
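`llama_rotate_half` pairs each dimension with its counterpart in the other half so that `q * cos + rotate_half(q) * sin` acts as a block-diagonal 2-D rotation. A self-contained check of that identity with toy shapes, independent of transformers (the duplicated-angle layout below mirrors the rotate-half convention):

``` python
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # Same idea as llama_rotate_half: swap the two halves and negate the second one.
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


bsz, num_heads, seq_len, head_dim = 1, 1, 4, 8  # toy shapes
q = torch.randn(bsz, num_heads, seq_len, head_dim)

# Each rotation angle is shared by a dimension and its partner in the other half,
# which is the cos/sin layout the rotate-half trick expects.
theta = torch.rand(bsz, num_heads, seq_len, head_dim // 2)
cos = torch.cat((theta.cos(), theta.cos()), dim=-1)
sin = torch.cat((theta.sin(), theta.sin()), dim=-1)

q_embed = (q * cos) + (rotate_half(q) * sin)

# A rotation never changes vector length, so RoPE leaves the query norms intact.
assert torch.allclose(q.norm(dim=-1), q_embed.norm(dim=-1), atol=1e-5)
```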
+ """ + # In previous transformers version cos/sin cached had a shape of 4D + if len(cos.shape) == 4: + gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] + gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) + cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) + # In the new version, it is 2D so we fall back to the new implementation + # https://github.com/huggingface/transformers/blame/eef7ea98c31a333bacdc7ae7a2372bde772be8e4/src/transformers/models/llama/modeling_llama.py#L222-L226 + else: + cos = cos[position_ids].unsqueeze(1) + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (llama_rotate_half(q) * sin) + return q_embed + + +def llama_compute_query_states(model: nn.Module, **kwargs) -> torch.Tensor: + """ + Compute query states for Llama models specifically. They need to be recomputed as the forward() method of the + original LlamaModel in the transformers library does not return them. See the related discussion in the PR: + https://github.com/huggingface/peft/pull/268 + """ + hidden_states = kwargs.get("hidden_states") + position_ids = kwargs.get("position_ids") + past_key_value = kwargs.get("past_key_value") + bsz, q_len, _ = hidden_states.size() + query_states = model.q_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2) + value_states = model.v_proj(hidden_states).view(bsz, q_len, model.num_heads, model.head_dim).transpose(1, 2) + seq_len = q_len + + if past_key_value is not None: + if isinstance(past_key_value, tuple): + # for transformers <= 4.35 + seq_len += past_key_value[0].shape[-2] + else: + # since transformers 4.36, this is a DynamicCache instance + seq_len += past_key_value.get_seq_length(model.layer_idx) + + # For transformers > 4.37.2 `position_ids` became a required arguments in the rotary embedding's forward pass. 
+ if "position_ids" not in inspect.signature(model.rotary_emb.forward).parameters: + # TODO we assume that position_ids is not None here, not sure if that is safe but the old code also did that + cos, sin = model.rotary_emb(value_states, seq_len=seq_len) + return llama_apply_rotary_pos_emb(query_states, cos, sin, position_ids) + + past_seen_tokens = 0 + if position_ids is None: + # Compute position_ids, since they are required for transformers > 4.37.2 + if past_key_value is None: + new_cache_positions = torch.arange(q_len, q_len + q_len, device=value_states.device) + else: + past_seen_tokens = past_key_value.get_usable_length(q_len, model.layer_idx) + new_cache_positions = torch.arange(past_seen_tokens, past_seen_tokens + q_len, device=value_states.device) + position_ids = new_cache_positions.unsqueeze(0) + + cos, sin = model.rotary_emb(value_states, seq_len=q_len + past_seen_tokens, position_ids=position_ids) + + # For batched inference unsqueeze it on the correct dim + # since: https://github.com/huggingface/transformers/pull/29109 + if len(cos.shape) == 3: + cos = cos.unsqueeze(1) + sin = sin.unsqueeze(1) + + return (query_states * cos) + (llama_rotate_half(query_states) * sin) + + +def is_adaption_prompt_trainable(params: str) -> bool: + """Return True if module is trainable under adaption prompt fine-tuning.""" + return params.split(".")[-1].startswith("adaption_") diff --git a/MoRA/peft_mora/tuners/ia3/__init__.py b/MoRA/peft_mora/tuners/ia3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d54cd89cc1d123eb321f5c6a13f382afe70e4b55 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .config import IA3Config +from .layer import Conv2d, IA3Layer, Linear +from .model import IA3Model + + +__all__ = ["Conv2d", "IA3Config", "IA3Layer", "IA3Model", "Linear"] + + +def __getattr__(name): + if (name == "Linear8bitLt") and is_bnb_available(): + from .bnb import Linear8bitLt + + return Linear8bitLt + + if (name == "Linear4bit") and is_bnb_4bit_available(): + from .bnb import Linear4bit + + return Linear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b7dd99ab2f5f6f5b1f5f7911c7dddd540ec5d61 Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d8f1be8c02054ab6c6f2643ac7a0c6c2cf7190e Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45dda2966e23bce643b13ded365e7a1db9a84f88 Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/ia3/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d7bffa489a033e6604d1b8df44d1e5d22a160f6 Binary files /dev/null and b/MoRA/peft_mora/tuners/ia3/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/ia3/bnb.py b/MoRA/peft_mora/tuners/ia3/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..3d033191a27081bc7cd34601f62f53ab8eca2778 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/bnb.py @@ -0,0 +1,129 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
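The module-level `__getattr__` above (PEP 562) defers importing the bitsandbytes-backed classes until they are first requested, so importing the tuner package never requires bitsandbytes to be installed. A stripped-down sketch of the same pattern, written as a hypothetical package `__init__.py` (the relative imports are placeholders, so this shows the shape of the idiom rather than a runnable script):

``` python
# Hypothetical my_tuners/__init__.py, shown only to illustrate the lazy-import pattern.
from .layer import Linear  # cheap import, always available


def __getattr__(name):
    # Called only when `name` is not found as a regular module attribute (PEP 562).
    if name == "Linear8bitLt":
        from .bnb import Linear8bitLt  # optional heavy dependency, imported on first access

        return Linear8bitLt
    raise AttributeError(f"module {__name__} has no attribute {name}")
```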
+ +from typing import Any + +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .layer import IA3Layer + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, IA3Layer): + # (IA)^3 implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + if self.disable_adapters: + return self.base_layer(x) + + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32) + if requires_conversion: + x = x.float() + if self.is_feedforward: + result = self.base_layer(x * ia3_scaling) + expected_dtype = result.dtype + else: + result = self.base_layer(x) + expected_dtype = result.dtype + result = result * ia3_scaling + + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, IA3Layer): + # IA3 implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + is_feedforward: bool, + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + + # Freezing the pre-trained weight matrix + self.get_base_layer().weight.requires_grad = False + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # note: no check for self.merged because merging is not supported (yet) + if self.disable_adapters: + return self.base_layer(x) + + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + requires_conversion = (not torch.is_autocast_enabled()) and (x.dtype != torch.float32) + if requires_conversion: + x = x.float() + if self.is_feedforward: + result = self.base_layer(x * ia3_scaling) + expected_dtype = result.dtype + else: + result = self.base_layer(x) + expected_dtype = result.dtype + result = result * ia3_scaling + + result = result.clone() + # adalora.py and lora.py both suggest that this is necessary for 4-bit training on older versions of Pytorch. + # This has been duplicated here. + + if requires_conversion: + result = result.to(expected_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "ia3." + rep diff --git a/MoRA/peft_mora/tuners/ia3/config.py b/MoRA/peft_mora/tuners/ia3/config.py new file mode 100644 index 0000000000000000000000000000000000000000..8e52286c05a0eb4befb808c51a7475b714af7847 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/config.py @@ -0,0 +1,98 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
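Both quantized layers apply the same (IA)^3 rule as the float implementation: feedforward targets scale the input before the base layer, every other target scales the output after it. A plain-PyTorch sketch of that rule without bitsandbytes (illustrative sizes):

``` python
import torch
import torch.nn as nn

base = nn.Linear(16, 16)
x = torch.randn(2, 16)

ia3_ff = torch.ones(16)    # vector for a feedforward target, ones at initialization
ia3_attn = torch.ones(16)  # vector for an attention k/v target, ones at initialization

out_ff = base(x * ia3_ff)      # feedforward: rescale the activations going in
out_attn = base(x) * ia3_attn  # everything else: rescale the activations coming out

# At the ones-initialization both paths reproduce the base layer exactly.
assert torch.allclose(out_ff, base(x))
assert torch.allclose(out_attn, base(x))
```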
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + + +@dataclass +class IA3Config(PeftConfig): + """ + This is the configuration class to store the configuration of a [`IA3Model`]. + + Args: + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + feedforward_modules (`Optional[Union[List[str], str]]`): + The names of the modules to be treated as feedforward modules, as in the original paper. These modules will + have (IA)Β³ vectors multiplied to the input, instead of the output. `feedforward_modules` must be a name or + a subset of names present in `target_modules`. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from (IA)Β³ layers to be set as trainable and saved in the final checkpoint. + init_ia3_weights (`bool`): + Whether to initialize the vectors in the (IA)Β³ layers, defaults to `True`. Setting this to `False` is + discouraged. + """ + + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with (IA)Β³." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually." 
+ ), + }, + ) + feedforward_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or a regex expression of module names which are feedforward" + "For example, ['output.dense']" + }, + ) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from (IA)^3 layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_ia3_weights: bool = field( + default=True, + metadata={"help": "Whether to initialize the vectors in the (IA)^3 layers."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.IA3 + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + self.feedforward_modules = ( + set(self.feedforward_modules) if isinstance(self.feedforward_modules, list) else self.feedforward_modules + ) + + # check if feedforward_modules is a subset of target_modules. run the check only if both are sets + if isinstance(self.feedforward_modules, set) and isinstance(self.target_modules, set): + if not self.feedforward_modules.issubset(self.target_modules): + raise ValueError("`feedforward_modules` should be a subset of `target_modules`") diff --git a/MoRA/peft_mora/tuners/ia3/layer.py b/MoRA/peft_mora/tuners/ia3/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..3668460768bcc703e0f597030450125336c475e6 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/layer.py @@ -0,0 +1,307 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
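Beyond normalizing lists to sets, `__post_init__` enforces a single constraint: `feedforward_modules` must be a subset of `target_modules`. A hedged usage sketch, assuming the vendored package is importable as `peft_mora` (the module names are model-specific examples, not defaults):

``` python
from peft_mora import IA3Config

# Valid: every feedforward module is also a target module.
config = IA3Config(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["k", "v", "wi_1"],
    feedforward_modules=["wi_1"],
)

# Invalid: "wo" is not in target_modules, so __post_init__ raises.
try:
    IA3Config(target_modules=["k", "v"], feedforward_modules=["wo"])
except ValueError as err:
    print(err)  # `feedforward_modules` should be a subset of `target_modules`
```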
+ +import warnings +from typing import Any, List, Optional + +import torch +import torch.nn as nn +from transformers.pytorch_utils import Conv1D + +from peft_mora.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft_mora.utils import transpose + + +class IA3Layer(BaseTunerLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("ia3_l",) + + def __init__(self, base_layer: nn.Module, is_feedforward: bool, **kwargs) -> None: + self.base_layer = base_layer + self.ia3_l = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.is_feedforward = is_feedforward + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + self.in_features = in_features + self.out_features = out_features + + def update_layer(self, adapter_name, init_ia3_weights): + # This code works for linear layers, override for other layer types + # Actual trainable parameters + if self.is_feedforward: + weight = torch.randn((1, self.in_features)) + else: + weight = torch.randn((self.out_features, 1)) + self.ia3_l[adapter_name] = nn.Parameter(weight) + if init_ia3_weights: + self.reset_ia3_parameters(adapter_name) + self.to(self.get_base_layer().weight.device) + self.set_adapter(self.active_adapters) + + def reset_ia3_parameters(self, adapter_name): + if adapter_name in self.ia3_l.keys(): + # initialize learned vector with torch.ones + nn.init.constant_(self.ia3_l[adapter_name], 1.0) + + +class Linear(nn.Module, IA3Layer): + # (IA)^3 implemented in a dense layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + is_target_conv_1d_layer: bool = False, # whether target module is a conv1d layer. useful while unloading later + init_ia3_weights: bool = True, # whether to initialize IA3 weights + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + self.fan_in_fan_out = fan_in_fan_out + self.is_target_conv_1d_layer = is_target_conv_1d_layer + self._active_adapter = adapter_name + self.update_layer(adapter_name, init_ia3_weights) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + if safe_merge: + orig_weights = base_layer.weight.data + orig_weights = torch.mul(orig_weights, ia3_l) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_l) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + warnings.warn("Unmerge result can be inaccurate for (IA)^3.") + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + # Add tolerace to avoid division by zero + ia3_l = transpose(self.ia3_l[active_adapter].data, self.fan_in_fan_out) + 1e-8 + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_l) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.div(base_layer.bias.data, scaling.data + 1e-8) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + dtype = previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + dtype = self.ia3_l[active_adapter].dtype + ia3_scaling *= self.ia3_l[active_adapter].flatten() + + if self.is_feedforward: + x = x.to(dtype) + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # e.g. bf16 vs fp32. Is that okay? 
+ interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + result = result.to(dtype) * ia3_scaling + + result = result.to(previous_dtype) + return result + + +class Conv2d(nn.Module, IA3Layer): + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_feedforward: bool = False, # Set to True if the layer is treated as a feedforward layer + init_ia3_weights: bool = True, + **kwargs, + ) -> None: + super().__init__() + IA3Layer.__init__(self, base_layer, is_feedforward=is_feedforward) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + self.update_layer(adapter_name, init_ia3_weights) + + def update_layer(self, adapter_name, init_ia3_weights): + # Actual trainable parameters + if self.is_feedforward: + weight = torch.randn((1, self.in_features, 1, 1)) + else: + weight = torch.randn((1, self.out_features, 1, 1)) + self.ia3_l[adapter_name] = nn.Parameter(weight) + if init_ia3_weights: + self.reset_ia3_parameters(adapter_name) + self.to(self.get_base_layer().weight.device) + self.set_adapter(self.active_adapters) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + ia3_scaling = self.ia3_l[active_adapter].data + if not self.is_feedforward: + ia3_scaling = ia3_scaling.permute(1, 0, 2, 3) + + if safe_merge: + output_weight = torch.mul(base_layer.weight.data, ia3_scaling).clone() + + if not torch.isfinite(output_weight).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = output_weight + else: + base_layer.weight.data = torch.mul(base_layer.weight.data, ia3_scaling) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + warnings.warn("Unmerge result can be inaccurate for (IA)^3.") + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.ia3_l.keys(): + base_layer = self.get_base_layer() + # divide by (IA)^3 vector. 
Add tolerace to avoid division by zero + ia3_scaling = self.ia3_l[active_adapter].data + if not self.is_feedforward: + ia3_scaling = ia3_scaling.permute(1, 0, 2, 3) + base_layer.weight.data = torch.div(base_layer.weight.data, ia3_scaling + 1e-8) + + if not self.is_feedforward and (base_layer.bias is not None): + scaling = self.ia3_l[active_adapter].reshape(base_layer.bias.shape) + base_layer.bias.data = torch.mul(base_layer.bias.data, scaling.data) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + dtype = previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + ia3_scaling = 1 + for active_adapter in self.active_adapters: + if active_adapter not in self.ia3_l.keys(): + continue + dtype = self.ia3_l[active_adapter].dtype + ia3_scaling *= self.ia3_l[active_adapter] + + if self.is_feedforward: + x = x.to(dtype) + # TODO: weight.dtype can be != self.ia3_l[self.active_adapters].dtype + # e.g. bf16 vs fp32. Is that okay? + interm = (x * ia3_scaling).to(self.get_base_layer().weight.dtype) + result = self.base_layer(interm, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + result = result.to(dtype) * ia3_scaling + + result = result.to(previous_dtype) + return result diff --git a/MoRA/peft_mora/tuners/ia3/model.py b/MoRA/peft_mora/tuners/ia3/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c5d46cf6d24f3fb92da8fe45c61d2395a36c51f6 --- /dev/null +++ b/MoRA/peft_mora/tuners/ia3/model.py @@ -0,0 +1,394 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import re +import warnings +from dataclasses import asdict +from enum import Enum +from typing import Optional + +import torch +from torch import nn +from transformers.pytorch_utils import Conv1D + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _get_submodules, +) + +from .layer import Conv2d, IA3Layer, Linear + + +class IA3Model(BaseTuner): + """ + Creates a Infused Adapter by Inhibiting and Amplifying Inner Activations ((IA)^3) model from a pretrained + transformers model. The method is described in detail in https://arxiv.org/abs/2205.05638 + + Args: + model ([`~transformers.PreTrainedModel`]): The model to be adapted. + config ([`IA3Config`]): The configuration of the (IA)^3 model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The (IA)^3 model. 
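For Conv2d targets the learned vector has shape `(1, channels, 1, 1)`, so in the forward path it broadcasts over the batch and spatial dimensions like a per-channel gain. A small sketch of that scaling for a non-feedforward conv target (toy sizes, plain PyTorch):

``` python
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)
x = torch.randn(2, 3, 16, 16)

ia3_l = torch.ones(1, 8, 1, 1)  # one scale per output channel, ones at initialization
out = conv(x) * ia3_l           # broadcasts over batch, height and width

assert torch.allclose(out, conv(x))  # a no-op until the vector is trained away from ones
print(out.shape)                     # torch.Size([2, 8, 16, 16])
```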
+ + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM, ia3Config + >>> from peft import IA3Model, IA3Config + + >>> config = IA3Config( + ... peft_type="IA3", + ... task_type="SEQ_2_SEQ_LM", + ... target_modules=["k", "v", "w0"], + ... feedforward_modules=["w0"], + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> ia3_model = IA3Model(config, model) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`ia3Config`]): The configuration of the (IA)^3 model. + """ + + prefix: str = "ia3_" + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + @staticmethod + def _create_new_module(ia3_config, adapter_name, target, **kwargs): + # avoid eager bnb import + if is_bnb_available(): + import bitsandbytes as bnb + + from .bnb import Linear8bitLt + + if is_bnb_4bit_available(): + from .bnb import Linear4bit + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + is_feedforward = kwargs.pop("is_feedforward", False) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target_base_layer.state.has_fp16_weights, + "memory_efficient_backward": target_base_layer.state.memory_efficient_backward, + "threshold": target_base_layer.state.threshold, + "index": target_base_layer.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, is_feedforward=is_feedforward, **eightbit_kwargs) + elif loaded_in_4bit and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, is_feedforward=is_feedforward, **fourbit_kwargs) + elif isinstance(target, torch.nn.Conv2d): + new_module = Conv2d(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = False + new_module = Linear(target, adapter_name, is_feedforward=is_feedforward, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " + "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = ia3_config.fan_in_fan_out = True + new_module = Linear( + target, adapter_name, is_feedforward=is_feedforward, is_target_conv_1d_layer=True, **kwargs + ) + else: + raise ValueError( + f"Target module {target} is not supported. " + f"Currently, only `torch.nn.Linear`, `torch.nn.Conv2d`, and `Conv1D` are supported." 
+ ) + return new_module + + @staticmethod + def _check_target_module_exists(ia3_config, key): + return check_target_module_exists(ia3_config, key) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + def _create_and_replace( + self, + ia3_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + # check if target module is in feedforward_modules + is_feedforward = self._check_target_module_feedforward(ia3_config, current_key) + + kwargs = { + "fan_in_fan_out": ia3_config.fan_in_fan_out, + "init_ia3_weights": ia3_config.init_ia3_weights, + "is_feedforward": is_feedforward, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + + if isinstance(target, IA3Layer): + target.update_layer( + adapter_name, + ia3_config.init_ia3_weights, + ) + else: + new_module = self._create_new_module(ia3_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + @staticmethod + def _check_target_module_feedforward(ia3_config, key) -> bool: + """ + A helper private method that checks if the target module `key` matches with a feedforward module specified in + `ia3_config` + """ + if isinstance(ia3_config.feedforward_modules, str): + is_feedforward = bool(re.fullmatch(ia3_config.feedforward_modules, key)) + else: + is_feedforward = any(key.endswith(target_key) for target_key in ia3_config.feedforward_modules) + return is_feedforward + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + # layers with base_layer don't need the weight to be copied, as they have a reference already + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if self.prefix in name: + module.to(child.weight.device) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (IA3Layer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. 
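`_check_target_module_feedforward` follows the usual peft matching semantics: a single string is treated as a regex that must fully match the module key, while a list matches by suffix. A tiny standalone illustration with a made-up module path:

``` python
import re
from typing import List, Union


def is_feedforward(feedforward_modules: Union[str, List[str]], key: str) -> bool:
    if isinstance(feedforward_modules, str):
        return bool(re.fullmatch(feedforward_modules, key))
    return any(key.endswith(target) for target in feedforward_modules)


key = "encoder.block.0.layer.1.DenseReluDense.wi_1"
print(is_feedforward(["wi_1", "wo"], key))             # True, suffix match
print(is_feedforward(r".*DenseReluDense\.wi_1", key))  # True, full regex match
print(is_feedforward(["q", "v"], key))                 # False
```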
+ """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. + """ + for module in self.model.modules(): + if isinstance(module, IA3Layer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + + def _prepare_adapter_config(self, peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING[model_config["model_type"]] + if peft_config.feedforward_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING: + raise ValueError("Please specify `feedforward_modules` in `peft_config`") + peft_config.feedforward_modules = TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING[ + model_config["model_type"] + ] + return peft_config + + def _unload_and_optionally_merge( + self, merge: bool = True, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ): + r""" + This method merges the (IA)^3 layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + safe_merge (`bool`, `optional`, defaults to `False`): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + if getattr(self.model, "is_loaded_in_8bit", False): + raise ValueError("Cannot merge ia3 layers when the model is loaded in 8-bit mode") + + if getattr(self.model, "is_loaded_in_4bit", False): + raise ValueError("Cannot merge ia3 layers when the model is loaded in 4-bit mode") + + self._unloading_checks(adapter_names) + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def merge_and_unload(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> torch.nn.Module: + r""" + This method merges the IAΒ³ layers into the base model. This is needed if someone wants to use the base model as + a standalone model. + + Args: + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge(safe_merge=safe_merge, adapter_names=adapter_names) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the IAΒ³ modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, IA3Layer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] diff --git a/MoRA/peft_mora/tuners/loha/__init__.py b/MoRA/peft_mora/tuners/loha/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2f39deee17ab9cb0e24b3a98d8b54eb7eeb27c1f --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import LoHaConfig +from .layer import Conv2d, Linear, LoHaLayer +from .model import LoHaModel + + +__all__ = ["LoHaConfig", "LoHaModel", "Conv2d", "Linear", "LoHaLayer"] diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a473c8c1816b811b15cc6c0d8c8ab231eddaa6ce Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9993e6eb22f188f86101fc36ca369c4b4b499a77 Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1eb9cf066964c717c21b288f1098b70cc05fa104 Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/loha/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45ead36837177dbcd203c78b239dfea8b123f060 Binary files /dev/null and b/MoRA/peft_mora/tuners/loha/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/loha/config.py b/MoRA/peft_mora/tuners/loha/config.py new file mode 100644 index 0000000000000000000000000000000000000000..9450d4735122a462f71e898705aa4ac1ab303fe9 --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/config.py @@ -0,0 +1,121 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.tuners.lycoris_utils import LycorisConfig +from peft_mora.utils import PeftType + + +@dataclass +class LoHaConfig(LycorisConfig): + """ + This is the configuration class to store the configuration of a [`LoHaModel`]. + + Args: + r (`int`): + LoHa rank. + alpha (`int`): + The alpha parameter for LoHa scaling. + rank_dropout (`float`): + The dropout probability for rank dimension during training. 
+ module_dropout (`float`): + The dropout probability for disabling LoHa modules during training. + use_effective_conv2d (`bool`): + Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + init_weights (`bool`): + Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is + discouraged. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `alpha`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + """ + + r: int = field(default=8, metadata={"help": "LoHa rank"}) + alpha: int = field(default=8, metadata={"help": "LoHa alpha"}) + rank_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for rank dimension during training"} + ) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling LoHa modules during training"} + ) + use_effective_conv2d: bool = field( + default=False, + metadata={ + "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + }, + ) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with LoHa." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the LoHa layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." 
+ }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoHA layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LOHA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/MoRA/peft_mora/tuners/loha/layer.py b/MoRA/peft_mora/tuners/loha/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..5b782f9dea544513b6a7e10b37a2061f223060d2 --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/layer.py @@ -0,0 +1,375 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Any, Set, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft_mora.tuners.lycoris_utils import LycorisLayer + + +class LoHaLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("hada_w1_a", "hada_w1_b", "hada_w2_a", "hada_w2_b", "hada_t1", "hada_t2") + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # LoHa info + self.hada_w1_a = nn.ParameterDict({}) + self.hada_w1_b = nn.ParameterDict({}) + self.hada_w2_a = nn.ParameterDict({}) + self.hada_w2_b = nn.ParameterDict({}) + self.hada_t1 = nn.ParameterDict({}) + self.hada_t2 = nn.ParameterDict({}) + + @property + def _available_adapters(self) -> Set[str]: + return {*self.hada_w1_a, *self.hada_w1_b, *self.hada_w2_a, *self.hada_w2_b, *self.hada_t1, *self.hada_t2} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...]): + # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L130C9-L143C75 + if len(shape) == 4: + self.hada_t1[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode + self.hada_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + + self.hada_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0])) # out_dim, 1-mode + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) # in_dim , 2-mode + else: + self.hada_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) + self.hada_w1_b[adapter_name] = 
nn.Parameter(torch.empty(r, shape[1])) + + self.hada_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0], r)) + self.hada_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1])) + + def reset_adapter_parameters(self, adapter_name: str): + # Original implementation performs initialization with normal distribution + # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 + + # FedPara paper proposes to perform He initialization, let's stick with it + # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization + if adapter_name in self.hada_w1_a.keys(): + nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.zeros_(self.hada_w2_b[adapter_name]) + if adapter_name in self.hada_t1.keys(): + nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + + def reset_adapter_parameters_random(self, adapter_name: str): + # Original implementation performs initialization with normal distribution + # https://github.com/KohakuBlueleaf/LyCORIS/blob/3549fdef8f564761d68b695a08ef88b1122fdedc/lycoris/modules/loha.py#L158 + + # FedPara paper proposes to perform He initialization, let's stick with it + # It is enough to initialize only single matrix with zeros to make adapter do nothing after initialization + if adapter_name in self.hada_w1_a.keys(): + nn.init.kaiming_uniform_(self.hada_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w1_b[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_w2_b[adapter_name], a=math.sqrt(5)) + if adapter_name in self.hada_t1.keys(): + nn.init.kaiming_uniform_(self.hada_t1[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.hada_t2[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + alpha: float, + rank_dropout: float, + module_dropout: float, + init_weights: bool, + use_effective_conv2d: bool = False, + **kwargs, + ) -> None: + """Internal function to create loha adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + alpha (`float`): Alpha for the added adapter. + rank_dropout (`float`): The dropout probability for rank dimension during training. + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize weights. + use_effective_conv2d (`bool`, *optional*, defaults to `False`): + Use parameter effective decomposition for Conv2d with ksize > 1. 
+ """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.alpha[adapter_name] = alpha + self.scaling[adapter_name] = alpha / r + self.rank_dropout[adapter_name] = rank_dropout + self.module_dropout[adapter_name] = module_dropout + + # Determine shape of LoHa weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) + if use_effective_conv2d: + shape = (base_layer.out_channels, base_layer.in_channels, *base_layer.kernel_size) + else: + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) + else: + raise TypeError(f"LoHa is not implemented for base layers of type {type(base_layer).__name__}") + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L178 + if adapter_name in self.hada_t1.keys(): + weight = make_weight_cp( + self.hada_t1[adapter_name], + self.hada_w1_a[adapter_name], + self.hada_w1_b[adapter_name], + self.hada_t2[adapter_name], + self.hada_w2_a[adapter_name], + self.hada_w2_b[adapter_name], + scale=torch.tensor(self.scaling[adapter_name]), + ) + else: + weight = make_weight( + self.hada_w1_a[adapter_name], + self.hada_w1_b[adapter_name], + self.hada_w2_a[adapter_name], + self.hada_w2_b[adapter_name], + scale=torch.tensor(self.scaling[adapter_name]), + ) + + base_layer = self.get_base_layer() + weight = weight.reshape(base_layer.weight.shape) + + # Perform rank dropout during training - drop rows of addition weights + rank_dropout = self.rank_dropout[adapter_name] + if self.training and rank_dropout: + drop = (torch.rand(weight.size(0)) > rank_dropout).to(weight.dtype) + drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device) + # TODO: Investigate if there should be a scaler like in normal dropout during training + # Original implementation doesn't have it + # https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L193 + drop /= drop.mean() + weight *= drop + + return weight + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if 
(not self.training) or (self.training and torch.rand(1) > module_dropout): + result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs) + + result = result.to(previous_dtype) + return result + + +class Linear(LoHaLayer): + """LoHa implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + return F.linear(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." + rep + + +class Conv2d(LoHaLayer): + """LoHa implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() + return F.conv2d( + input, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, + ) + + def __repr__(self) -> str: + rep = super().__repr__() + return "loha." 
+ rep + + +# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py#L9 + + +class HadaWeight(torch.autograd.Function): + @staticmethod + def forward(ctx, w1a, w1b, w2a, w2b, scale=torch.tensor(1)): + ctx.save_for_backward(w1a, w1b, w2a, w2b, scale) + diff_weight = ((w1a @ w1b) * (w2a @ w2b)) * scale + return diff_weight + + @staticmethod + def backward(ctx, grad_out): + (w1a, w1b, w2a, w2b, scale) = ctx.saved_tensors + grad_out = grad_out * scale + temp = grad_out * (w2a @ w2b) + grad_w1a = temp @ w1b.T + grad_w1b = w1a.T @ temp + + temp = grad_out * (w1a @ w1b) + grad_w2a = temp @ w2b.T + grad_w2b = w2a.T @ temp + + del temp + return grad_w1a, grad_w1b, grad_w2a, grad_w2b, None + + +class HadaWeightCP(torch.autograd.Function): + @staticmethod + def forward(ctx, t1, w1a, w1b, t2, w2a, w2b, scale=torch.tensor(1)): + ctx.save_for_backward(t1, w1a, w1b, t2, w2a, w2b, scale) + + rebuild1 = torch.einsum("i j k l, j r, i p -> p r k l", t1, w1b, w1a) + rebuild2 = torch.einsum("i j k l, j r, i p -> p r k l", t2, w2b, w2a) + + return rebuild1 * rebuild2 * scale + + @staticmethod + def backward(ctx, grad_out): + (t1, w1a, w1b, t2, w2a, w2b, scale) = ctx.saved_tensors + grad_out = grad_out * scale + + temp = torch.einsum("i j k l, j r -> i r k l", t2, w2b) + rebuild = torch.einsum("i j k l, i r -> r j k l", temp, w2a) + + grad_w = rebuild * grad_out + del rebuild + + grad_w1a = torch.einsum("r j k l, i j k l -> r i", temp, grad_w) + grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w1a.T) + del grad_w, temp + + grad_w1b = torch.einsum("i r k l, i j k l -> r j", t1, grad_temp) + grad_t1 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w1b.T) + del grad_temp + + temp = torch.einsum("i j k l, j r -> i r k l", t1, w1b) + rebuild = torch.einsum("i j k l, i r -> r j k l", temp, w1a) + + grad_w = rebuild * grad_out + del rebuild + + grad_w2a = torch.einsum("r j k l, i j k l -> r i", temp, grad_w) + grad_temp = torch.einsum("i j k l, i r -> r j k l", grad_w, w2a.T) + del grad_w, temp + + grad_w2b = torch.einsum("i r k l, i j k l -> r j", t2, grad_temp) + grad_t2 = torch.einsum("i j k l, j r -> i r k l", grad_temp, w2b.T) + del grad_temp + return grad_t1, grad_w1a, grad_w1b, grad_t2, grad_w2a, grad_w2b, None + + +def make_weight(w1a, w1b, w2a, w2b, scale): + return HadaWeight.apply(w1a, w1b, w2a, w2b, scale) + + +def make_weight_cp(t1, w1a, w1b, t2, w2a, w2b, scale): + return HadaWeightCP.apply(t1, w1a, w1b, t2, w2a, w2b, scale) diff --git a/MoRA/peft_mora/tuners/loha/model.py b/MoRA/peft_mora/tuners/loha/model.py new file mode 100644 index 0000000000000000000000000000000000000000..25315a337a222b703017f329ac0dda7584ab8edd --- /dev/null +++ b/MoRA/peft_mora/tuners/loha/model.py @@ -0,0 +1,114 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
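The `HadaWeight`/`HadaWeightCP` functions above (in `loha/layer.py`) rebuild the LoHa update as an element-wise (Hadamard) product of two low-rank products, which is why the effective rank of the update can exceed `r`. A minimal sketch of that construction; the dimensions are illustrative assumptions, not values from the diff:

``` python
import torch

out_dim, in_dim, r, scale = 64, 32, 4, 1.0
w1a, w1b = torch.randn(out_dim, r), torch.randn(r, in_dim)
w2a, w2b = torch.randn(out_dim, r), torch.randn(r, in_dim)

# Same computation as HadaWeight.forward:
# delta_W = ((w1a @ w1b) * (w2a @ w2b)) * scale
delta_w = (w1a @ w1b) * (w2a @ w2b) * scale

print(delta_w.shape)                      # torch.Size([64, 32])
print(torch.linalg.matrix_rank(delta_w))  # generically r * r = 16 for random factors
```

Note that the custom `backward` in `HadaWeight` saves only the four small factors and recomputes the two low-rank products when gradients are needed, rather than keeping the intermediate products around for autograd.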
+ +import re +from itertools import chain +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft_mora.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, LoHaLayer + + +class LoHaModel(LycorisTuner): + """ + Creates Low-Rank Hadamard Product model from a pretrained model. The method is partially described in + https://arxiv.org/abs/2108.06098 Current implementation heavily borrows from + https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/loha.py + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`LoHaConfig`]): The configuration of the LoHa model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The LoHa model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import LoHaModel, LoHaConfig + + >>> config_te = LoHaConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = LoHaConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... use_effective_conv2d=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = LoHaModel(model.text_encoder, config_te, "default") + >>> model.unet = LoHaModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`LoHaConfig`]): The configuration of the LoHa model. + """ + + prefix: str = "hada_" + layers_mapping: Dict[Type[torch.nn.Module], Type[LoHaLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LoHaLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LoHaLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/MoRA/peft_mora/tuners/lokr/__init__.py b/MoRA/peft_mora/tuners/lokr/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..408cf2a54ae4c0befa9e3f1cad4ff93d71cfedc5 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import LoKrConfig +from .layer import Conv2d, Linear, LoKrLayer +from .model import LoKrModel + + +__all__ = ["LoKrConfig", "LoKrModel", "Conv2d", "Linear", "LoKrLayer"] diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b00d76dc4698178e3a4696eeec41816f8fc6c3b Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..093a1b7eed661c5f54a0ca30214f9e76df468fb2 Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87d07bd1c43879ed27bcbf76ee2c927b7a5943c9 Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/lokr/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b9f40b1916776cc459f1c7efd38ca1f5ad7c34e Binary files /dev/null and b/MoRA/peft_mora/tuners/lokr/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lokr/config.py b/MoRA/peft_mora/tuners/lokr/config.py new file mode 100644 index 0000000000000000000000000000000000000000..43ed67529b2f17d289d07908ce66230ccd1b5522 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/config.py @@ -0,0 +1,127 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.tuners.lycoris_utils import LycorisConfig +from peft_mora.utils import PeftType + + +@dataclass +class LoKrConfig(LycorisConfig): + """ + Configuration class of [`LoKrModel`]. + + Args: + r (`int`): + LoKr rank. + alpha (`int`): + The alpha parameter for LoKr scaling. + rank_dropout (`float`): + The dropout probability for rank dimension during training. 
+ module_dropout (`float`): + The dropout probability for disabling LoKr modules during training. + use_effective_conv2d (`bool`): + Use parameter effective decomposition for Conv2d with ksize > 1 ("Proposition 3" from FedPara paper). + decompose_both (`bool`): + Perform rank decomposition of left kronecker product matrix. + decompose_factor (`int`): + Kronecker product decomposition factor. + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + init_weights (`bool`): + Whether to perform initialization of adapter weights. This defaults to `True`, passing `False` is + discouraged. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `alpha`. + modules_to_save (`Optional[List[str]]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + """ + + r: int = field(default=8, metadata={"help": "LoKr rank"}) + alpha: int = field(default=8, metadata={"help": "LoKr alpha"}) + rank_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for rank dimension during training"} + ) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling LoKr modules during training"} + ) + use_effective_conv2d: bool = field( + default=False, + metadata={ + "help": 'Use parameter effective decomposition for Conv2d 3x3 with ksize > 1 ("Proposition 3" from FedPara paper)' + }, + ) + decompose_both: bool = field( + default=False, + metadata={"help": "Perform rank decomposition of left kronecker product matrix."}, + ) + decompose_factor: int = field(default=-1, metadata={"help": "Kronecker product decomposition factor."}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with LoKr." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the LoKr layers with their default initialization. 
Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoKr layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LOKR diff --git a/MoRA/peft_mora/tuners/lokr/layer.py b/MoRA/peft_mora/tuners/lokr/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..1a961d19440cb3ea10386b6ca77d79a093c34016 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/layer.py @@ -0,0 +1,409 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
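For orientation, a hedged sketch of how the `LoKrConfig` above might be applied through `get_peft_model` from the vendored `peft_mora` package; the base model (`facebook/opt-125m`), target module names, and hyperparameter values are illustrative assumptions, not taken from the diff:

``` python
from transformers import AutoModelForCausalLM
from peft_mora import LoKrConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")

config = LoKrConfig(
    r=8,
    alpha=16,
    target_modules=["q_proj", "v_proj"],  # nn.Linear attention projections in OPT
    rank_dropout=0.0,
    module_dropout=0.0,
    decompose_both=False,  # keep the left Kronecker factor dense
    decompose_factor=-1,   # -1: factorize each dimension near its square root
    task_type="CAUSAL_LM",
)

model = get_peft_model(base, config)
model.print_trainable_parameters()  # only the lokr_* parameters should be trainable
```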
+ +import math +from typing import Any, Optional, Set, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from peft_mora.tuners.lycoris_utils import LycorisLayer + + +class LoKrLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ( + "lokr_w1", + "lokr_w1_a", + "lokr_w1_b", + "lokr_w2", + "lokr_w2_a", + "lokr_w2_b", + "lokr_t2", + ) + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module) -> None: + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # LoKr info + self.lokr_w1 = nn.ParameterDict({}) + self.lokr_w1_a = nn.ParameterDict({}) + self.lokr_w1_b = nn.ParameterDict({}) + self.lokr_w2 = nn.ParameterDict({}) + self.lokr_w2_a = nn.ParameterDict({}) + self.lokr_w2_b = nn.ParameterDict({}) + self.lokr_t2 = nn.ParameterDict({}) + + @property + def _available_adapters(self) -> Set[str]: + return { + *self.lokr_w1, + *self.lokr_w1_a, + *self.lokr_w1_b, + *self.lokr_w2, + *self.lokr_w2_a, + *self.lokr_w2_b, + *self.lokr_t2, + } + + def create_adapter_parameters( + self, + adapter_name: str, + r: int, + shape, + use_w1: bool, + use_w2: bool, + use_effective_conv2d: bool, + ): + if use_w1: + self.lokr_w1[adapter_name] = nn.Parameter(torch.empty(shape[0][0], shape[1][0])) + else: + self.lokr_w1_a[adapter_name] = nn.Parameter(torch.empty(shape[0][0], r)) + self.lokr_w1_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][0])) + + if len(shape) == 4: + # Conv2d + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1], *shape[2:])) + elif use_effective_conv2d: + self.lokr_t2[adapter_name] = nn.Parameter(torch.empty(r, r, shape[2], shape[3])) + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(r, shape[0][1])) # b, 1-mode + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) # d, 2-mode + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1] * shape[2] * shape[3])) + else: + # Linear + if use_w2: + self.lokr_w2[adapter_name] = nn.Parameter(torch.empty(shape[0][1], shape[1][1])) + else: + self.lokr_w2_a[adapter_name] = nn.Parameter(torch.empty(shape[0][1], r)) + self.lokr_w2_b[adapter_name] = nn.Parameter(torch.empty(r, shape[1][1])) + + def reset_adapter_parameters(self, adapter_name: str): + if adapter_name in self.lokr_w1: + nn.init.zeros_(self.lokr_w1[adapter_name]) + else: + nn.init.zeros_(self.lokr_w1_a[adapter_name]) + nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_w2: + nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_t2: + nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + + def reset_adapter_parameters_random(self, adapter_name: str): + if adapter_name in self.lokr_w1: + nn.init.kaiming_uniform_(self.lokr_w1[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w1_a[adapter_name], a=math.sqrt(5)) + nn.init.kaiming_uniform_(self.lokr_w1_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_w2: + nn.init.kaiming_uniform_(self.lokr_w2[adapter_name], a=math.sqrt(5)) + else: + nn.init.kaiming_uniform_(self.lokr_w2_a[adapter_name], a=math.sqrt(5)) + 
nn.init.kaiming_uniform_(self.lokr_w2_b[adapter_name], a=math.sqrt(5)) + + if adapter_name in self.lokr_t2: + nn.init.kaiming_uniform_(self.lokr_t2[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + alpha: float, + rank_dropout: float, + module_dropout: float, + init_weights: bool, + use_effective_conv2d: bool, + decompose_both: bool, + decompose_factor: int, + **kwargs, + ) -> None: + """Internal function to create lokr adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + alpha (`float`): Alpha for the added adapter. + rank_dropout (`float`): The dropout probability for rank dimension during training + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize adapter weights. + use_effective_conv2d (`bool`): Use parameter effective decomposition for Conv2d with ksize > 1. + decompose_both (`bool`): Perform rank decomposition of left kronecker product matrix. + decompose_factor (`int`): Kronecker product decomposition factor. + """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.alpha[adapter_name] = alpha + self.scaling[adapter_name] = alpha / r + self.rank_dropout[adapter_name] = rank_dropout + self.module_dropout[adapter_name] = module_dropout + base_layer = self.get_base_layer() + + # Determine shape of LoKr weights + if isinstance(base_layer, nn.Linear): + in_dim, out_dim = base_layer.in_features, base_layer.out_features + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n)) # ((a, b), (c, d)), out_dim = a*c, in_dim = b*d + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = not (r < max(shape[0][1], shape[1][1]) / 2) + use_effective_conv2d = False + elif isinstance(base_layer, nn.Conv2d): + in_dim, out_dim = base_layer.in_channels, base_layer.out_channels + k_size = base_layer.kernel_size + + in_m, in_n = factorization(in_dim, decompose_factor) + out_l, out_k = factorization(out_dim, decompose_factor) + shape = ((out_l, out_k), (in_m, in_n), *k_size) # ((a, b), (c, d), *k_size) + + use_w1 = not (decompose_both and r < max(shape[0][0], shape[1][0]) / 2) + use_w2 = r >= max(shape[0][1], shape[1][1]) / 2 + use_effective_conv2d = use_effective_conv2d and base_layer.kernel_size != (1, 1) + else: + raise TypeError(f"LoKr is not implemented for base layers of type {type(base_layer).__name__}") + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, use_w1, use_w2, use_effective_conv2d) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + # https://github.com/KohakuBlueleaf/LyCORIS/blob/e4259b870d3354a9615a96be61cb5d07455c58ea/lycoris/modules/lokr.py#L224 + if adapter_name in self.lokr_w1: + w1 = self.lokr_w1[adapter_name] + 
else: + w1 = self.lokr_w1_a[adapter_name] @ self.lokr_w1_b[adapter_name] + + if adapter_name in self.lokr_w2: + w2 = self.lokr_w2[adapter_name] + elif adapter_name in self.lokr_t2: + w2 = make_weight_cp(self.lokr_t2[adapter_name], self.lokr_w2_a[adapter_name], self.lokr_w2_b[adapter_name]) + else: + w2 = self.lokr_w2_a[adapter_name] @ self.lokr_w2_b[adapter_name] + + # Make weights with Kronecker product + weight = make_kron(w1, w2) + weight = weight.reshape(self.get_base_layer().weight.shape) + + # Perform rank dropout during training - drop rows of addition weights + rank_dropout = self.rank_dropout[adapter_name] + if self.training and rank_dropout: + drop = (torch.rand(weight.size(0)) > rank_dropout).float() + drop = drop.view(-1, *[1] * len(weight.shape[1:])).to(weight.device) + drop /= drop.mean() + weight *= drop + + return weight + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = result + self._get_delta_activations(active_adapter, x, *args, **kwargs) + + result = result.to(previous_dtype) + return result + + +class Linear(LoKrLayer): + """LoKr implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + return F.linear(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." 
+ rep + + +class Conv2d(LoKrLayer): + """LoKr implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[torch.dtype] = None, + adapter_name: str = "default", + r: int = 0, + alpha: float = 0.0, + rank_dropout: float = 0.0, + module_dropout: float = 0.0, + use_effective_conv2d: bool = False, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer( + adapter_name, r, alpha, rank_dropout, module_dropout, init_weights, use_effective_conv2d, **kwargs + ) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + # don't add bias here, because the bias is already included in the output of the base_layer + base_layer = self.get_base_layer() + return F.conv2d( + input, + delta_weight, + stride=base_layer.stride, + padding=base_layer.padding, + dilation=base_layer.dilation, + groups=base_layer.groups, + ) + + def __repr__(self) -> str: + rep = super().__repr__() + return "lokr." + rep + + +# Below code is a direct copy from https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py#L11 + + +def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]: + """Factorizes the provided number into the product of two numbers + + Args: + dimension (`int`): The number that needs to be factorized. + factor (`int`, optional): + Factorization divider. The algorithm will try to output two numbers, one of each will be as close to the + factor as possible. If -1 is provided, the decomposition algorithm would try to search dividers near the + square root of the dimension. Defaults to -1. + + Returns: + Tuple[`int`, `int`]: A tuple of two numbers, whose product is equal to the provided number. The first number is + always less than or equal to the second. + + Example: + ```py + >>> factorization(256, factor=-1) + (16, 16) + + >>> factorization(128, factor=-1) + (8, 16) + + >>> factorization(127, factor=-1) + (1, 127) + + >>> factorization(128, factor=4) + (4, 32) + ``` + """ + + if factor > 0 and (dimension % factor) == 0: + m = factor + n = dimension // factor + return m, n + if factor == -1: + factor = dimension + m, n = 1, dimension + length = m + n + while m < n: + new_m = m + 1 + while dimension % new_m != 0: + new_m += 1 + new_n = dimension // new_m + if new_m + new_n > length or new_m > factor: + break + else: + m, n = new_m, new_n + if m > n: + n, m = m, n + return m, n + + +def make_weight_cp(t, wa, wb): + rebuild2 = torch.einsum("i j k l, i p, j r -> p r k l", t, wa, wb) # [c, d, k1, k2] + return rebuild2 + + +def make_kron(w1, w2, scale=1.0): + if len(w2.shape) == 4: + w1 = w1.unsqueeze(2).unsqueeze(2) + w2 = w2.contiguous() + rebuild = torch.kron(w1, w2) + + return rebuild * scale diff --git a/MoRA/peft_mora/tuners/lokr/model.py b/MoRA/peft_mora/tuners/lokr/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b3e1c930b3fb7b7889df6a62b0fe84d4d0f0eaa3 --- /dev/null +++ b/MoRA/peft_mora/tuners/lokr/model.py @@ -0,0 +1,115 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from itertools import chain +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft_mora.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, LoKrLayer + + +class LoKrModel(LycorisTuner): + """ + Creates Low-Rank Kronecker Product model from a pretrained model. The original method is partially described in + https://arxiv.org/abs/2108.06098 and in https://arxiv.org/abs/2309.14859 Current implementation heavily borrows + from + https://github.com/KohakuBlueleaf/LyCORIS/blob/eb460098187f752a5d66406d3affade6f0a07ece/lycoris/modules/lokr.py + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`LoKrConfig`]): The configuration of the LoKr model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The LoKr model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import LoKrModel, LoKrConfig + + >>> config_te = LoKrConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = LoKrConfig( + ... r=8, + ... lora_alpha=32, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... rank_dropout=0.0, + ... module_dropout=0.0, + ... init_weights=True, + ... use_effective_conv2d=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = LoKrModel(model.text_encoder, config_te, "default") + >>> model.unet = LoKrModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`LoKrConfig`]): The configuration of the LoKr model. + """ + + prefix: str = "lokr_" + layers_mapping: Dict[Type[torch.nn.Module], Type[LoKrLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LoKrLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. 
+ """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(config.rank_pattern.keys(), config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + kwargs["alpha"] = config.alpha_pattern.get(target_name_key, config.alpha) + + if isinstance(target, LoKrLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/MoRA/peft_mora/tuners/lora.py b/MoRA/peft_mora/tuners/lora.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/MoRA/peft_mora/tuners/lora/__init__.py b/MoRA/peft_mora/tuners/lora/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8a81e5f36933d8f898acb72edee441b3e3d4537d --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + +from .config import LoftQConfig, LoraConfig +from .gptq import QuantLinear +from .layer import Conv2d, Embedding, Linear, LoraLayer +from .model import LoraModel + + +__all__ = ["LoraConfig", "LoftQConfig", "Conv2d", "Embedding", "LoraLayer", "Linear", "LoraModel", "QuantLinear"] + + +def __getattr__(name): + if (name == "Linear8bitLt") and is_bnb_available(): + from .bnb import Linear8bitLt + + return Linear8bitLt + + if (name == "Linear4bit") and is_bnb_4bit_available(): + from .bnb import Linear4bit + + return Linear4bit + + raise AttributeError(f"module {__name__} has no attribute {name}") diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df371c5eba4279f6ba9a4e8cc903d6a51c902000 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/aqlm.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/aqlm.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..87581be9e84c5043d9e880713bc2b79c7d3b5cf4 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/aqlm.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/awq.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/awq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..edc3a080105b1f0b95c460e00428a06954b29c04 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/awq.cpython-312.pyc differ diff --git 
a/MoRA/peft_mora/tuners/lora/__pycache__/bnb.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/bnb.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d89f91ff3ccb16f1c76816369ee09d4bf64a10c4 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/bnb.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0d5a321b3b9c91688c786754ba4f17aa3a19593 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/gptq.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/gptq.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57ab2777599acd805c821caa445dd48bc83c875d Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/gptq.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ab79867105c22f997466ea40f3658183775c5c7 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1235d0ca7be5bd6c6b13e7e8d10ebdc0916e9d61 Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/__pycache__/tp_layer.cpython-312.pyc b/MoRA/peft_mora/tuners/lora/__pycache__/tp_layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..05fe836dd5476a51ea5764cbe304d176036cd4ae Binary files /dev/null and b/MoRA/peft_mora/tuners/lora/__pycache__/tp_layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/lora/aqlm.py b/MoRA/peft_mora/tuners/lora/aqlm.py new file mode 100644 index 0000000000000000000000000000000000000000..6b359b3c86a6db5b643928e4f52e5fe6f72bd421 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/aqlm.py @@ -0,0 +1,100 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
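The module-level `__getattr__` in `lora/__init__.py` above is the PEP 562 lazy-import pattern: the bitsandbytes-backed classes are resolved only on first access, so the package imports cleanly when `bitsandbytes` is absent. A standalone sketch of the same idea; the availability check and the `__main__` usage below are illustrative, not code from the diff:

``` python
import importlib.util
import sys


def _bnb_available() -> bool:
    # Rough stand-in for peft_mora.import_utils.is_bnb_available()
    return importlib.util.find_spec("bitsandbytes") is not None


def __getattr__(name):  # only consulted when normal module lookup fails (PEP 562)
    if name == "Linear8bitLt" and _bnb_available():
        from bitsandbytes.nn import Linear8bitLt  # deferred, potentially heavy import
        return Linear8bitLt
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


if __name__ == "__main__":
    this_module = sys.modules[__name__]
    try:
        print(getattr(this_module, "Linear8bitLt"))
    except AttributeError as exc:
        print(exc)  # bitsandbytes missing: the attribute simply does not exist
```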
+ +from typing import Any, Optional + +import torch + +from peft_mora.import_utils import is_aqlm_available +from peft_mora.tuners.lora.layer import LoraLayer +from peft_mora.tuners.tuners_utils import BaseTunerLayer + + +if is_aqlm_available(): + from aqlm import QuantizedLinear + + +class AqlmLoraLinear(torch.nn.Module, LoraLayer): + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer) + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora) + + def forward(self, x: torch.Tensor): + # note: logic differs from default Linear because merging is not supported + result = self.base_layer(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 + # def reset_lora_parameters(self, adapter_name): + # if adapter_name in self.lora_A.keys(): + # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) + # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) + + +def dispatch_aqlm( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if is_aqlm_available() and isinstance(target_base_layer, QuantizedLinear): + new_module = AqlmLoraLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.codes + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/awq.py b/MoRA/peft_mora/tuners/lora/awq.py new file mode 100644 index 0000000000000000000000000000000000000000..3ad45be6c9e7b88ff2856ae66e851e56f5da0f39 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/awq.py @@ -0,0 +1,108 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
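`dispatch_aqlm` above follows the dispatcher convention shared by the quantization backends: inspect the (possibly wrapped) base layer and return either a LoRA-wrapped module or `None`, so the caller can fall through to the next backend. A hedged sketch of that convention; `MyQuantLinear`, `MyBackendLoraLinear`, and `create_new_module` are made-up names, not from the diff:

``` python
from typing import Any, Callable, List, Optional

import torch.nn as nn


class MyQuantLinear(nn.Linear):
    """Stand-in for a backend-specific quantized linear layer."""


class MyBackendLoraLinear(nn.Module):
    """Stand-in for the LoRA wrapper a real dispatcher would construct."""

    def __init__(self, base_layer: nn.Module, adapter_name: str, **kwargs: Any) -> None:
        super().__init__()
        self.base_layer = base_layer
        self.adapter_name = adapter_name

    def forward(self, x):
        return self.base_layer(x)  # adapter math omitted in this sketch


def dispatch_my_backend(target: nn.Module, adapter_name: str, **kwargs: Any) -> Optional[nn.Module]:
    if isinstance(target, MyQuantLinear):
        return MyBackendLoraLinear(target, adapter_name, **kwargs)
    return None  # not this backend: let the next dispatcher try


def create_new_module(target: nn.Module, adapter_name: str,
                      dispatchers: List[Callable[..., Optional[nn.Module]]], **kwargs: Any) -> nn.Module:
    for dispatch in dispatchers:
        new_module = dispatch(target, adapter_name, **kwargs)
        if new_module is not None:
            return new_module
    raise ValueError(f"No dispatcher could handle a target of type {type(target).__name__}.")


# First matching dispatcher wins.
wrapped = create_new_module(MyQuantLinear(16, 16), "default", [dispatch_my_backend])
print(type(wrapped).__name__)  # MyBackendLoraLinear
```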
+import importlib.metadata as importlib_metadata +from typing import Any, Optional + +import packaging.version +import torch + +from peft_mora.import_utils import is_auto_awq_available +from peft_mora.tuners.lora.layer import LoraLayer +from peft_mora.tuners.tuners_utils import BaseTunerLayer + + +if is_auto_awq_available(): + from awq.modules.linear import WQLinear_GEMM + + +class AwqLoraLinear(torch.nn.Module, LoraLayer): + def __init__( + self, + base_layer, + adapter_name, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer) + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer + + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora) + + def forward(self, x: torch.Tensor): + result = self.quant_linear_module(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result = result + output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +def dispatch_awq( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if is_auto_awq_available() and isinstance(target_base_layer, WQLinear_GEMM): + # Raise the error only at the dispatch level + AUTOAWQ_MINIMUM_VERSION = packaging.version.parse("0.2.0") + version_autoawq = packaging.version.parse(importlib_metadata.version("autoawq")) + + if AUTOAWQ_MINIMUM_VERSION > version_autoawq: + raise ImportError( + f"Found an incompatible version of auto-awq. Found version {version_autoawq}, " + f"but only versions above {AUTOAWQ_MINIMUM_VERSION} are supported for PEFT." + ) + + new_module = AwqLoraLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/bnb.py b/MoRA/peft_mora/tuners/lora/bnb.py new file mode 100644 index 0000000000000000000000000000000000000000..605a50b783d3b80e4871adaf5bec88151110751b --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/bnb.py @@ -0,0 +1,399 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import List, Optional + +import bitsandbytes as bnb +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft_mora.utils.other import transpose + +from .layer import LoraLayer + + +if is_bnb_available(): + + class Linear8bitLt(torch.nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + use_mora: bool = False, + mora_type: int = 1, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_mora=use_mora, + mora_type=mora_type, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Merge lora module to 8-bit linear may get different generations due to rounding errors." + ) + lora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + + # Dequantize the result of identity matrix and int8 weight because bitsandbytes does not support int8 + # dequantization directly + im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device) + im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im) + im, Sim = bnb.functional.transform(im, "col32") + if state.CxB is None: + state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB) + out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB) + output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t() + + w_data = output.to(lora_data.dtype).to(lora_data.device) + lora_data + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Unmerge lora module to 8-bit linear may get different generations due to rounding errors." + ) + lora_data = self.get_delta_weight(active_adapter) + + weight = self.get_base_layer().weight + state = self.get_base_layer().state + if state.SCB is None: + state.SCB = weight.SCB + im = torch.eye(weight.data.shape[-1]).contiguous().half().to(weight.device) + im, imt, SCim, SCimt, coo_tensorim = bnb.functional.double_quant(im) + im, Sim = bnb.functional.transform(im, "col32") + + if state.CxB is None: + state.CxB, state.SB = bnb.functional.transform(weight.data, to_order=state.formatB) + out32, Sout32 = bnb.functional.igemmlt(im, state.CxB, Sim, state.SB) + output = bnb.functional.mm_dequant(out32, Sout32, SCim, state.SCB, bias=None).t() + + w_data = output.to(lora_data.dtype).to(lora_data.device) - lora_data + self.get_base_layer().weight = bnb.nn.Int8Params( + w_data.to("cpu"), requires_grad=False, has_fp16_weights=weight.has_fp16_weights + ).to(weight.device) + state.reset_grads() + + def get_delta_weight(self, adapter): + return ( + transpose( + self.lora_B[adapter].weight @ self.lora_A[adapter].weight, + False, + ) + * self.scaling[adapter] + ) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + compute_dtype = lora_A.weight.dtype + if x.dtype != compute_dtype: + x = x.to(compute_dtype) + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result = result + output + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + def dispatch_bnb_8bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_8bit = kwargs.get("loaded_in_8bit", False) + if loaded_in_8bit and isinstance(target_base_layer, bnb.nn.Linear8bitLt): + eightbit_kwargs = kwargs.copy() + eightbit_kwargs.update( + { + "has_fp16_weights": target.state.has_fp16_weights, + "memory_efficient_backward": target.state.memory_efficient_backward, + "threshold": target.state.threshold, + "index": target.index, + } + ) + new_module = Linear8bitLt(target, adapter_name, **eightbit_kwargs) + + return new_module + + +if is_bnb_4bit_available(): + + class Linear4bit(torch.nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer: torch.nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + use_mora: bool = False, + mora_type: int = 1, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_mora=use_mora, + mora_type=mora_type, + ) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Merge lora module to 4-bit linear may get different generations due to rounding errors." + ) + # Refer to https://gist.github.com/ChrisHayduk/1a53463331f52dca205e55982baf9930 + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + lora_data = self.get_delta_weight(active_adapter) + + w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) + lora_data + if safe_merge and not torch.isfinite(w_data).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( + weight.device + ) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. 
Nothing to do.") + return + + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter not in self.lora_A.keys(): + continue + warnings.warn( + "Unmerge lora module to 4-bit linear may get different generations due to rounding errors." + ) + weight = self.get_base_layer().weight + kwargs = weight.__dict__ + lora_data = self.get_delta_weight(active_adapter) + w_data = bnb.functional.dequantize_4bit(weight.data, weight.quant_state) - lora_data + if "bnb_quantized" in kwargs: + kwargs["bnb_quantized"] = False + self.get_base_layer().weight = bnb.nn.Params4bit(w_data.to("cpu"), requires_grad=False, **kwargs).to( + weight.device + ) + + def get_delta_weight(self, adapter): + return ( + transpose( + self.lora_B[adapter].weight @ self.lora_A[adapter].weight, + False, + ) + * self.scaling[adapter] + ) + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + # As per Tim Dettmers, for 4bit, we need to defensively clone here. + # The reason is that in some cases, an error can occur that backprop + # does not work on a manipulated view. This issue may be solved with + # newer PyTorch versions but this would need extensive testing to be + # sure. + result = result.clone() + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + if self.use_mora[active_adapter]: + x = dropout(x) + output = self._apply_mora(x, lora_A, lora_B, scaling, active_adapter) + else: + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result = result + output + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + def dispatch_bnb_4bit(target: torch.nn.Module, adapter_name: str, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + loaded_in_4bit = kwargs.get("loaded_in_4bit", False) + if loaded_in_4bit and is_bnb_4bit_available() and isinstance(target_base_layer, bnb.nn.Linear4bit): + fourbit_kwargs = kwargs.copy() + fourbit_kwargs.update( + { + "compute_dtype": target_base_layer.compute_dtype, + "compress_statistics": target_base_layer.weight.compress_statistics, + "quant_type": target_base_layer.weight.quant_type, + } + ) + new_module = Linear4bit(target, adapter_name, **fourbit_kwargs) + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/config.py b/MoRA/peft_mora/tuners/lora/config.py new file mode 100644 index 0000000000000000000000000000000000000000..fe0d567efc0c65cced8008680542b27a8a53eb0e --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/config.py @@ -0,0 +1,292 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Literal, Optional, Union + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + + +@dataclass +class LoftQConfig: + """ + This is the sub-configuration class to store the configuration of a [`LoraModel`]. + + Args: + bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the + default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. + bits (`int`): Quantization bits for LoftQ. + iter (`int`): Alternating iterations for LoftQ. + fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear + models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 + bits. + """ + + loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + + +@dataclass +class LoraConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`LoraModel`]. + + Args: + r (`int`): + Lora attention dimension (the "rank"). + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, + excluding the output layer. If this is not specified, modules will be chosen according to the model + architecture. If the architecture is not known, an error will be raised -- in this case, you should specify + the target modules manually. + lora_alpha (`int`): + The alpha parameter for Lora scaling. + lora_dropout (`float`): + The dropout probability for Lora layers. + fan_in_fan_out (`bool`): + Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses + `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. + bias (`str`): + Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases + will be updated during training. Be aware that this means that, even when disabling the adapters, the model + will not produce the same output as the base model would have without adaptation. + use_rslora (`bool`): + When set to True, uses Rank-Stabilized LoRA which + sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work better. + Otherwise, it will use the original default value of `lora_alpha/r`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. 
+ init_lora_weights (`bool` | `Literal["gaussian", "loftq"]`): + How to initialize the weights of the adapter layers. Passing True (default) results in the default + initialization from the reference implementation from Microsoft. Passing 'gaussian' results in Gaussian + initialization scaled by the LoRA rank for linear and layers. Setting the initialization to False leads to + completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ initialization. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + alpha_pattern (`dict`): + The mapping from layer names or regexp expression to alphas which are different from the default alpha + specified by `lora_alpha`. + megatron_config (`Optional[dict]`): + The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You can + get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from Megatron. + The arguments will be used to initialize the TransformerConfig of Megatron. You need to specify this + parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear layers of megatron. + megatron_core (`Optional[str]`): + The core module from Megatron to use, defaults to `"megatron.core"`. + loftq_config (`Optional[LoftQConfig]`): + The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone weights + and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not pass a + quantized model in this case, as LoftQ will quantize the model itself. + use_dora (`bool`): + Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the weights + into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is + handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low + ranks. Right now, DoRA only supports non-quantized linear layers. DoRA introduces a bigger overhead than + pure LoRA, so it is recommended to merge weights for inference. For more information, see + https://arxiv.org/abs/2402.09353. + """ + + r: int = field(default=8, metadata={"help": "Lora attention dimension"}) + target_modules: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": ( + "List of module names or regex expression of the module names to replace with LoRA." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$'." + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + "If not specified, modules will be chosen according to the model architecture, If the architecture is " + "not known, an error will be raised -- in this case, you should specify the target modules manually." 
+ ), + }, + ) + lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) + lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) + fan_in_fan_out: bool = field( + default=False, + metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, + ) + bias: Literal["none", "all", "lora_only"] = field( + default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"} + ) + use_rslora: bool = field( + default=False, + metadata={ + "help": ( + "When set to True, uses Rank-Stabilized LoRA doi.org/10.48550/arXiv.2312.03732" + " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it" + " was proven to work better. Otherwise, it will use the original default" + " value of `lora_alpha/r`." + ) + }, + ) + modules_to_save: Optional[list[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_lora_weights: bool | Literal["gaussian", "loftq"] = field( + default=True, + metadata={ + "help": ( + "How to initialize the weights of the LoRA layers. Passing True (default) results in the default " + "initialization from the reference implementation from Microsoft. Passing 'gaussian' results " + "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " + "to False leads to completely random initialization and is discouraged." + "Pass `'loftq'` to use LoftQ initialization" + ), + }, + ) + layers_to_transform: Optional[Union[list[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " + "This only works when target_modules is a list of str." + }, + ) + layers_pattern: Optional[Union[list[str], str]] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + "This only works when target_modules is a list of str." + }, + ) + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" + ) + }, + ) + megatron_config: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer." + "You can get it like this, `core_transformer_config_from_args(get_args())`, " + "these two functions being from Megatron." + "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and " + "RowParallelLinear layers of megatron." 
+ "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " + "functions, because TransformerConfig may not necessarily be serialized." + "But when using megatron, we can use `get_peft_model_state_dict` function and " + "megatron's framework, they can also save and load models and configurations." + ) + }, + ) + megatron_core: Optional[str] = field( + default="megatron.core", + metadata={ + "help": ( + "The core module from Megatron, it is used to create LoRA's parallel linear layer. " + "It only needs to be passed in when you need to use your own modified megatron core module. " + "Otherwise, it will use the default value `megatron.core`. " + ) + }, + ) + # dict type is used when loading config.json + loftq_config: Union[LoftQConfig, dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " + "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." + ) + }, + ) + use_dora: bool = field( + default=False, + metadata={ + "help": ( + "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the " + "weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the " + "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, " + "especially at low ranks. Right now, DoRA only supports non-quantized linear layers. DoRA introduces " + "a bigger overhead than pure LoRA, so it is recommended to merge weights for inference. For more " + "information, see https://arxiv.org/abs/2402.09353." + ) + }, + ) + use_mora: bool = field( + default=False, + metadata={ + "help": ( + "Enable MoRA" + ) + }, + ) + + mora_type: int = field( + default=1, + metadata={ + "help": ( + "Enable MoRA" + ) + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.LORA + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) + # if target_modules is a regex expression, then layers_to_transform should be None + if isinstance(self.target_modules, str) and self.layers_to_transform is not None: + raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + + # if target_modules is a regex expression, then layers_pattern should be None + if isinstance(self.target_modules, str) and self.layers_pattern is not None: + raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + + if self.use_dora and (self.megatron_config or self.init_lora_weights == "loftq"): + raise ValueError("DoRA does not support megatron_core or LoftQ. Please set `use_dora=False`.") + + # handle init_lora_weights and loftq_config + if self.init_lora_weights == "loftq": + import importlib + + if not importlib.util.find_spec("scipy"): + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + if self.loftq_config is None: + raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") + + # convert loftq_config to dict + if self.loftq_config and not isinstance(self.loftq_config, dict): + self.loftq_config = vars(self.loftq_config) diff --git a/MoRA/peft_mora/tuners/lora/gptq.py b/MoRA/peft_mora/tuners/lora/gptq.py new file mode 100644 index 0000000000000000000000000000000000000000..ee36adda1d7c734649600422b608808f077a0834 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/gptq.py @@ -0,0 +1,114 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Optional + +import torch + +from peft_mora.tuners.lora.layer import LoraLayer +from peft_mora.tuners.tuners_utils import BaseTunerLayer +from peft_mora.utils import get_auto_gptq_quant_linear + + +class QuantLinear(torch.nn.Module, LoraLayer): + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + # self.base_layer and self.quant_linear_module are the same; we need the former for consistency and the latter + # for backwards compatibility + self.quant_linear_module = base_layer + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + ) + + def forward(self, x: torch.Tensor): + # note: logic differs from default Linear because merging is not supported + result = self.quant_linear_module(x) + + if self.disable_adapters: + return result + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + requires_conversion = not torch.is_autocast_enabled() + if requires_conversion: + expected_dtype = result.dtype + x = x.to(lora_A.weight.dtype) + + output = lora_B(lora_A(dropout(x))) + if requires_conversion: + output = output.to(expected_dtype) + output = output * scaling + result += output + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + # TODO: Check if it is better as suggested by users https://github.com/PanQiWei/AutoGPTQ/pull/102 + # def reset_lora_parameters(self, adapter_name): + # if adapter_name in self.lora_A.keys(): + # torch.nn.init.xavier_uniform_(self.lora_A[adapter_name].weight) + # torch.nn.init.zeros_(self.lora_B[adapter_name].weight) + + +def dispatch_gptq( + target: torch.nn.Module, + adapter_name: str, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + + if AutoGPTQQuantLinear is not None and isinstance(target_base_layer, AutoGPTQQuantLinear): + new_module = QuantLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/layer.py b/MoRA/peft_mora/tuners/lora/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..d3b1423fefce5804cc7025923b7d6b9c0e6dda29 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/layer.py @@ -0,0 +1,1096 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
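The rest of the `layer.py` diff holds the core MoRA logic: `update_layer` replaces the two rectangular LoRA matrices with a single square matrix (it points `lora_B` at the same `nn.Linear` as `lora_A`), choosing the square-matrix rank so the trainable parameter count stays close to a rank-`r` LoRA pair, while `_apply_mora` compresses the input down to that rank and expands the output back to the layer width. A minimal sketch of the rank calculation follows, assuming a hypothetical 4096 x 4096 projection purely for concreteness (the formula mirrors the one in `update_layer` below; the layer shape and rank are illustrative only):

``` python
# Sketch of the MoRA rank computation performed in LoraLayer.update_layer below.
# The layer shape (4096 x 4096) and requested rank (r = 8) are illustrative only.
import math

in_features, out_features, r = 4096, 4096, 8

# LoRA at rank r trains A (r x in_features) and B (out_features x r).
lora_params = r * (in_features + out_features)                  # 65536

# MoRA trains one square matrix of size r_hat x r_hat with a comparable budget.
mora_type = 6  # 6 = RoPE-based compression; 1 = sharing-based compression
r_hat = int(math.sqrt((in_features + out_features) * r) + 0.5)  # 256
if mora_type == 6:
    r_hat = r_hat // 2 * 2  # the RoPE-based variant needs an even rank

mora_params = r_hat * r_hat                                     # 65536

print(lora_params, r_hat, mora_params)
```

For this shape the square matrix has exactly as many parameters as the rank-8 LoRA pair it replaces; when the square root is not exact, the rounding makes the two counts differ only slightly.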
+ +import math +import warnings +from typing import Any, List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers.pytorch_utils import Conv1D + +from peft_mora.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft_mora.utils.integrations import gather_params_ctx +from peft_mora.utils.other import transpose + +from .config import LoraConfig + + +class LoraLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout") + + def __init__(self, base_layer: nn.Module, **kwargs) -> None: + self.base_layer = base_layer + self.r = {} + self.lora_alpha = {} + self.scaling = {} + self.lora_dropout = nn.ModuleDict({}) + self.lora_A = nn.ModuleDict({}) + self.lora_B = nn.ModuleDict({}) + # For Embedding layer + self.lora_embedding_A = nn.ParameterDict({}) + self.lora_embedding_B = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.use_dora: dict[str, bool] = {} + self.lora_magnitude_vector: Optional[torch.nn.ParameterDict] = None # for DoRA + self._caches: dict[str, Any] = {} + self.kwargs = kwargs + + self.use_mora: dict[str, bool] = {} + + self.mora_type: dict[str, int] = {} + + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + elif isinstance(base_layer, nn.Conv2d): + in_features, out_features = base_layer.in_channels, base_layer.out_channels + elif isinstance(base_layer, nn.Embedding): + in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + elif isinstance(base_layer, Conv1D): + in_features, out_features = ( + base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + ) + elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): + # QuantLinear + in_features, out_features = base_layer.infeatures, base_layer.outfeatures + elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = base_layer.input_size, base_layer.output_size + elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": + # AQLM QuantLinear + in_features, out_features = base_layer.in_features, base_layer.out_features + elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM": + # Awq layers + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + self.in_features = in_features + self.out_features = out_features + + def update_layer( + self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora: bool = False, + use_mora: bool = False, mora_type: int = 1, + ): + # This code works for linear layers, override for other layer types + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + 
self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) + + self.use_mora[adapter_name] = False + self.mora_type[adapter_name] = mora_type + + if use_mora: + new_r = int(math.sqrt((self.in_features + self.out_features)*r)+0.5) + if mora_type == 6: + # type 6 require new_r to be even for RoPE + new_r = new_r//2*2 + + self.lora_A[adapter_name] = nn.Linear(new_r, new_r, bias=False) + self.r[adapter_name] = new_r + + nn.init.zeros_(self.lora_A[adapter_name].weight) + self.lora_B[adapter_name] = self.lora_A[adapter_name] + self.use_mora[adapter_name] = True + self.scaling[adapter_name] = 1.0 + else: + # Actual trainable parameters + self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) + self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=False) + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + # check weight and qweight (for GPTQ) + for weight_name in ("weight", "qweight"): + weight = getattr(self.get_base_layer(), weight_name, None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + break + + if use_dora: + self.dora_init(adapter_name) + self.use_dora[adapter_name] = True + else: + self.use_dora[adapter_name] = False + + self.set_adapter(self.active_adapters) + + def reset_lora_parameters(self, adapter_name, init_lora_weights, mora_type=None): + if init_lora_weights is False: + return + + if self.use_mora[adapter_name]: + nn.init.zeros_(self.lora_A[adapter_name].weight) + self.lora_B[adapter_name] = self.lora_A[adapter_name] + if mora_type is not None: + self.mora_type[adapter_name] = mora_type + return + + if adapter_name in self.lora_A.keys(): + if init_lora_weights is True: + # initialize A the same way as the default for nn.Linear and B to zero + # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 + nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + elif init_lora_weights.lower() == "gaussian": + nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_lora_weights=}") + nn.init.zeros_(self.lora_B[adapter_name].weight) + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + nn.init.zeros_(self.lora_embedding_A[adapter_name]) + nn.init.normal_(self.lora_embedding_B[adapter_name]) + + def loftq_init(self, adapter_name): + from peft_mora.utils.loftq_utils import loftq_init + + weight = self.get_base_layer().weight + kwargs = { + "num_bits": self.kwargs.get("loftq_bits", 4), + "reduced_rank": self.r[adapter_name], + "num_iter": self.kwargs.get("loftq_iter", 1), + } + + qweight, lora_A, lora_B = loftq_init(weight, **kwargs) + if adapter_name in self.lora_A.keys(): + # initialize A the same way as the default for nn.Linear and B to zero + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + self.lora_embedding_A[adapter_name].weight.data 
= lora_A + self.lora_embedding_B[adapter_name].weight.data = lora_B + self.get_base_layer().weight.data = qweight + + def _get_weight_norm(self, weight, lora_weight, scaling) -> torch.Tensor: + # calculate L2 norm of weight matrix, column-wise + weight = weight + scaling * lora_weight + weight_norm = torch.linalg.norm(weight, dim=1) + return weight_norm + + def dora_init(self, adapter_name: str) -> None: + lora_A = self.lora_A[adapter_name] + lora_B = self.lora_B[adapter_name] + scaling = self.scaling[adapter_name] + with gather_params_ctx(self.get_base_layer()): + weight = self.get_base_layer().weight + lora_weight = lora_B.weight @ lora_A.weight + weight_norm = self._get_weight_norm(weight, lora_weight, scaling) + self.lora_magnitude_vector = nn.ParameterDict() + self.lora_magnitude_vector[adapter_name] = nn.Parameter(weight_norm, requires_grad=True) + # add lora_magnitude_vector to the list of learnable parameters + self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) + + def _cache_store(self, key: str, value: Any) -> None: + self._caches[key] = value + + def _cache_pop(self, key: str) -> Any: + value = self._caches.pop(key) + return value + + def _apply_mora(self, x, lora_A, lora_B, scaling, active_adapter): + in_f, out_f = self.in_features, self.out_features + r = self.r[active_adapter] + if active_adapter in self.mora_type: + mora_type = self.mora_type[active_adapter] + else: + mora_type = 1 + + if mora_type == 1 or mora_type == 4: + sum_inter = in_f // r + if in_f % r != 0: + pad_size = r - in_f % r + # x = torch.cat([x, torch.zeros_like(x)[..., :pad_size]], dim=-1) + x = torch.cat([x, x[..., :pad_size]], dim=-1) + sum_inter += 1 + in_x = x.view(*x.shape[:-1], sum_inter, r).sum(dim=-2) + elif mora_type == 2 or mora_type == 3: + mr, nr = in_f//r+1, in_f//r + m, n = in_f - r*nr, r*mr - in_f + mm, nn = m*mr, n * nr + if m > 0: + x_m, x_n = x[..., :mm], x[..., mm:] + x_m = x_m.view(*x.shape[:-1], m, mr).sum(dim=-1) + x_n = x_n.view(*x.shape[:-1], n, nr).sum(dim=-1) + in_x = torch.cat([x_m, x_n ], dim=-1) + else: + in_x = x.view(*x.shape[:-1], n, nr).sum(dim=-1) + elif mora_type == 6: + sum_inter = in_f // r + rb1 = in_f//r if in_f % r == 0 else in_f//r + 1 + if in_f % r != 0: + pad_size = r - in_f % r + x = torch.cat([x, x[..., :pad_size]], dim=-1) + sum_inter += 1 + in_x = x.view(*x.shape[:-1], sum_inter, r) + if not hasattr(self, 'cos') and not hasattr(self, 'sin'): + inv_freq = 1.0 / (10000 ** (torch.arange(0, r, 2).float() / r)) + t = torch.arange(rb1) + freqs = torch.outer(t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos = emb.cos().unsqueeze(0).to(x.device).to(x.dtype) + self.sin = emb.sin().unsqueeze(0).to(x.device).to(x.dtype) + rh_in_x = torch.cat((-in_x[..., r//2:], in_x[..., :r//2]), dim=-1) + in_x = in_x*self.cos + rh_in_x*self.sin + + + out_x = lora_A(in_x) + + if mora_type == 1 or mora_type == 3: + repeat_time = out_f // r + if out_f % r != 0: + repeat_time += 1 + out_x = torch.cat([out_x]*repeat_time, dim=-1)[..., :out_f] + elif mora_type == 2 or mora_type == 4: + mr, nr = out_f//r+1, out_f//r + m, n = out_f - r*nr, r*mr - out_f + mm, nn = m*mr, n * nr + if m > 0: + out_x = torch.cat([torch.repeat_interleave(out_x[..., :m], mr, dim=-1), + torch.repeat_interleave(out_x[..., m:], nr, dim=-1)] + , dim=-1) + else: + out_x = torch.repeat_interleave(out_x, nr, dim=-1) + elif mora_type == 6: + out_x = out_x.view(*x.shape[:-1], -1)[..., :out_f] + if out_x.shape[-1] < out_f: + repeat_time = out_f // out_x.shape[-1] + if out_f % 
out_x.shape[-1] != 0: + repeat_time += 1 + out_x = torch.cat([out_x]*repeat_time, dim=-1)[..., :out_f] + + return out_x + + def _apply_dora(self, x, lora_A, lora_B, scaling, active_adapter): + """ + For DoRA, calculate the extra output from LoRA with DoRA applied. This should be added on top of the base layer + output. + """ + lora_weight = lora_B.weight @ lora_A.weight + magnitude = self.lora_magnitude_vector[active_adapter] + weight = self.get_base_layer().weight + weight_norm = self._get_weight_norm(weight, lora_weight, scaling) + # see section 4.3 of DoRA (https://arxiv.org/abs/2402.09353) + # "[...] we suggest treating ||V +βˆ†V ||_c in + # Eq. (5) as a constant, thereby detaching it from the gradient + # graph. This means that while ||V + βˆ†V ||_c dynamically + # reflects the updates of βˆ†V , it won’t receive any gradient + # during backpropagation" + weight_norm = weight_norm.detach() + mag_norm_scale = (magnitude / weight_norm).view(1, -1) + result_dora = (mag_norm_scale - 1) * ( + F.linear(x, transpose(weight, self.fan_in_fan_out)) + ) + mag_norm_scale * lora_B(lora_A(x)) * scaling + + # Note: Computation could potentially be accelerated by using the code below instead of calculating X@W again. + # This is only correct if dropout=0, otherwise results will differ: + # https://github.com/huggingface/peft/pull/1474#issuecomment-1964682771 + # bias = self.get_base_layer().bias + # if bias is not None: + # result = result - bias + # result = mag_norm_scale * result + mag_norm_scale * lora_B(lora_A(x)) * scaling + # if bias is not None: + # result = result + bias + + return result_dora + + def set_scale(self, adapter, scale): + if adapter not in self.scaling: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + self.scaling[active_adapter] *= scale + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + if scale is None: + self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + +# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# and modified to work with PyTorch FSDP + + +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
+# ------------------------------------------------------------------------------------------ + + +class Linear(nn.Module, LoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_target_conv_1d_layer: bool = False, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + use_mora: bool = False, + mora_type: int = 1, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + use_mora=use_mora, + mora_type=mora_type, + ) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + delta_weight = self.get_delta_weight(active_adapter) + if not self.use_dora[active_adapter]: + orig_weights += delta_weight + else: + # handle dora + # since delta_weight already includes scaling, set it to 1 here + weight_norm = self._get_weight_norm(orig_weights, delta_weight, scaling=1).detach() + # We need to cache weight_norm because it has to be based on the original weights. We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + self._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm + orig_weights = dora_factor.view(-1, 1) * (orig_weights + delta_weight) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + delta_weight = self.get_delta_weight(active_adapter) + if not self.use_dora[active_adapter]: + base_layer.weight.data += delta_weight + else: + # handle dora + # since delta_weight already includes scaling, set it to 1 here + weight_norm = self._get_weight_norm(base_layer.weight, delta_weight, scaling=1).detach() + # We need to cache weight_norm because it has to be based on the original weights. 
We + # cannot calculate it on the fly based on the merged weights when unmerging because its a + # different value + self._cache_store(f"{active_adapter}-weight_norm", weight_norm) + dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm + new_weight = dora_factor.view(-1, 1) * (base_layer.weight.data + delta_weight) + base_layer.weight.data = new_weight + + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + weight = self.get_base_layer().weight + delta_weight = self.get_delta_weight(active_adapter) + if not self.use_dora[active_adapter]: + weight.data -= delta_weight + else: + weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") + dora_factor = self.lora_magnitude_vector[active_adapter] / weight_norm + weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight + weight.data = weight_orig + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_B[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. + cast_to_fp32 = device.type == "cpu" and dtype == torch.float16 + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + if self.use_mora[adapter]: + in_f, out_f = self.in_features, self.out_features + r = self.r[adapter] + if in_f % r != 0: + pad_size = r - in_f % r + else: + pad_size = 0 + repeat_time = out_f // r + if out_f % r != 0: + repeat_time += 1 + + if adapter not in self.mora_type or self.mora_type[adapter] == 1: + w = torch.zeros(r, in_f).to(device, dtype=dtype) + aw = weight_A + for i in range(in_f + pad_size): + w[:, i % in_f] += aw[:, i % r] + w = torch.cat([w]*repeat_time, dim=0)[:out_f] + elif self.mora_type[adapter] == 2: + w = weight_A + mr, nr = in_f//r+1, in_f//r + m, n = in_f - r*nr, r*mr - in_f + + mm, nn = m*mr, n * nr + w = torch.cat([torch.repeat_interleave(w[:, :m], mr, dim=1), + torch.repeat_interleave(w[:, m:], nr, dim=1)], dim=1) + + mr, nr = out_f//r+1, out_f//r + m, n = out_f - r*nr, r*mr - out_f + mm, nn = m*mr, n * nr + w = torch.cat([torch.repeat_interleave(w[:m], mr, dim=0), + torch.repeat_interleave(w[m:], nr, dim=0)], dim=0) + elif self.mora_type[adapter] == 3: + w = weight_A + mr, nr = in_f//r+1, in_f//r + m, n = in_f - r*nr, r*mr - in_f + mm, nn = m*mr, n * nr + w = torch.cat([torch.repeat_interleave(w[:, :m], mr, dim=1), + torch.repeat_interleave(w[:, m:], nr, dim=1)], dim=1) + + w = torch.cat([w]*repeat_time, dim=0)[:out_f] + elif self.mora_type[adapter] == 4: + w = torch.zeros(r, in_f).to(device, dtype=dtype) + aw = weight_A + for i in range(in_f + pad_size): + w[:, i % in_f] += aw[:, i % r] + + mr, nr = out_f//r+1, out_f//r + m, n = out_f - r*nr, r*mr - out_f + mm, nn = m*mr, n * nr + w = 
torch.cat([torch.repeat_interleave(w[:m], mr, dim=0), + torch.repeat_interleave(w[m:], nr, dim=0)], dim=0) + elif self.mora_type[adapter] == 6: + w = torch.zeros(in_f+pad_size, in_f).to(device, dtype=dtype) + rb1 = in_f//r if in_f % r == 0 else in_f//r + 1 + rb2 = out_f//r if out_f % r == 0 else out_f//r + 1 + sum_inter, repeat_time = rb1, rb2 + if not hasattr(self, 'cos') and not hasattr(self, 'sin'): + inv_freq = 1.0 / (10000 ** (torch.arange(0, r, 2).float() / r)) + t = torch.arange(rb1) + freqs = torch.outer(t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + self.cos = emb.cos().unsqueeze(0).to(w.device).to(w.dtype) + self.sin = emb.sin().unsqueeze(0).to(w.device).to(w.dtype) + cos, sin = self.cos, self.sin + aw = weight_A + aw2 = torch.cat((aw[:, r//2:], -aw[:, :r//2]), dim=-1) + for i in range(sum_inter-1): + w[i*r:(i+1)*r, i*r:(i+1)*r] = aw2*sin[:, i] + aw*cos[:, i] + i+=1 + w[i*r:, i*r:] = (aw2*sin[:, i] + aw*cos[:, i])[:, :r-pad_size] #+ aw2*sin[:, i])[:, :r-pad_size] + if pad_size > 0: + w[i*r:, :pad_size] = (aw2*sin[:, i] + aw*cos[:, i])[:, r-pad_size:] + if in_f < out_f: + w = torch.cat([w]*repeat_time, dim=0)[:out_f] + else: + w = w[:out_f] + else: + # old + w = torch.zeros(r, in_f).to(device, dtype=dtype) + aw = weight_A + for i in range(in_f): + w[:, i % in_f] += aw[:, i % r] + #w = torch.cat([w]*repeat_time, dim=0)[:out_f] + w = torch.cat([torch.repeat_interleave(w, out_f//r, dim=0), w], dim=0)[:out_f] + output_tensor = w + else: + output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + # print rank of output_tensor + # print(f'rank: {torch.linalg.matrix_rank(output_tensor.float())}') + return output_tensor + + # use_mora_merge_ft = False + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + # elif hasattr(self, 'use_mora_merge_ft') and self.use_mora_merge_ft: + # print('use_mora_merge_ft') + # active_adapter = self.active_adapters[0] + # ow = self.base_layer.weight.clone() + # in_f, out_f = self.in_features, self.out_features + # r = self.r[active_adapter] + # pad_size = r - in_f % r if in_f % r != 0 else 0 + # repeat_time = out_f // r + # if out_f % r != 0: repeat_time += 1 + # aw = self.lora_A[active_adapter].weight + # w = torch.zeros(r, in_f).to(ow.device, dtype=ow.dtype) + # for i in range(in_f + pad_size): + # w[:, i % in_f] += aw[:, i % r] + # w = torch.cat([w]*repeat_time, dim=0)[:out_f] + # result = F.linear(x, ow+w, self.base_layer.bias) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + if self.use_mora[active_adapter]: + # x = dropout(x) + # delta = self._apply_mora(x, lora_A, lora_B, scaling, active_adapter) + # print(delta.abs().mean().item()) + # with open('mora.txt', 'w') as f: + # print(delta.abs().mean().item(), file=f) + # result = result + delta + + x = 
dropout(x) + result = result + self._apply_mora(x, lora_A, lora_B, scaling, active_adapter) + elif not self.use_dora[active_adapter]: + # delta = lora_B(lora_A(dropout(x))) * scaling + # print(delta.abs().mean().item()) + # with open('lora.txt', 'w') as f: + # print(delta.abs().mean().item(), file=f) + # result = result + delta + + result = result + lora_B(lora_A(dropout(x))) * scaling + else: + x = dropout(x) + result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter) + + result = result.to(torch_result_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +class Embedding(nn.Module, LoraLayer): + # LoRA implemented in a Embedding layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + ) + + def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + weight_A = torch.randn((r, self.in_features)) + weight_B = torch.randn((self.out_features, r)) + self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) + self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + base_layer = self.get_base_layer() + weight = getattr(base_layer, "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + self.to(base_layer.weight.device, dtype=weight.dtype) + self.set_adapter(self.active_adapters) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_embedding_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_embedding_A.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_embedding_B[adapter].device + dtype = self.lora_embedding_A[adapter].dtype + + # In case users wants to merge the adapter weights that are in + # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. + cast_to_fp32 = device.type == "cpu" and dtype == torch.float16 + + weight_A = self.lora_embedding_A[adapter] + weight_B = self.lora_embedding_B[adapter] + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_embedding_A[adapter] = weight_A.to(dtype) + self.lora_embedding_B[adapter] = weight_B.to(dtype) + + return output_tensor + + def _embed(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: + base_layer = self.get_base_layer() + return F.embedding( + input, + weight, + padding_idx=base_layer.padding_idx, + max_norm=base_layer.max_norm, + norm_type=base_layer.norm_type, + scale_grad_by_freq=base_layer.scale_grad_by_freq, + sparse=base_layer.sparse, + ) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + # TODO: no dtype conversion here, unlike in Linear, is that correct? 
+ if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_embedding_A: + continue + embedding_A = self.lora_embedding_A[active_adapter].T + embedding_B = self.lora_embedding_B[active_adapter].T + scaling = self.scaling[active_adapter] + after_A = self._embed(x, embedding_A) + result += (after_A @ embedding_B) * scaling + result = result.to(torch_result_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + + +class Conv2d(nn.Module, LoraLayer): + # Lora implemented in a conv2d layer + def __init__( + self, + base_layer: nn.Module, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ) -> None: + super().__init__() + LoraLayer.__init__(self, base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + ) + + def update_layer(self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + # Actual trainable parameters + base_layer = self.get_base_layer() + kernel_size = base_layer.kernel_size + stride = base_layer.stride + padding = base_layer.padding + self.lora_A[adapter_name] = nn.Conv2d(self.in_features, r, kernel_size, stride, padding, bias=False) + self.lora_B[adapter_name] = nn.Conv2d(r, self.out_features, (1, 1), (1, 1), bias=False) + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + if init_lora_weights == "loftq": + self.loftq_init(adapter_name) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + weight = getattr(base_layer, "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + self.to(base_layer.weight.device, dtype=weight.dtype) + self.set_adapter(self.active_adapters) + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights inside the base weights + + Args: + safe_merge (`bool`, *optional*): + If True, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self.lora_A.keys(): + base_layer = self.get_base_layer() + if safe_merge: + # Note that safe_merge will be slower than the normal merge + # because of the copy operation. + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self.lora_A.keys(): + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_A[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # float16 because the `@` and matmul operation in general is not supported in torch + cpu + fp16. + cast_to_fp32 = device.type == "cpu" and dtype == torch.float16 + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + # https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117 + if self.get_base_layer().weight.size()[2:4] == (1, 1): + # conv2d 1x1 + output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze( + 3 + ) * self.scaling[adapter] + else: + # conv2d 3x3 + output_tensor = ( + F.conv2d( + weight_A.permute(1, 0, 2, 3), + weight_B, + ).permute(1, 0, 2, 3) + * self.scaling[adapter] + ) + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + return output_tensor + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + result += lora_B(lora_A(dropout(x))) * scaling + + result = result.to(torch_result_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." 
+ rep + + +def dispatch_default( + target: torch.nn.Module, + adapter_name: str, + lora_config: LoraConfig, + **kwargs, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Embedding): + embedding_kwargs = kwargs.copy() + embedding_kwargs.pop("fan_in_fan_out", None) + embedding_kwargs.update(lora_config.loftq_config) + new_module = Embedding(target, adapter_name, **embedding_kwargs) + elif isinstance(target_base_layer, torch.nn.Conv2d): + kwargs.update(lora_config.loftq_config) + new_module = Conv2d(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + if kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " + "Setting fan_in_fan_out to False." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + kwargs.update(lora_config.loftq_config) + new_module = Linear(target, adapter_name, **kwargs) + elif isinstance(target_base_layer, Conv1D): + if not kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to False but the target module is `Conv1D`. " "Setting fan_in_fan_out to True." + ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + kwargs.update(lora_config.loftq_config) + new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) + + return new_module diff --git a/MoRA/peft_mora/tuners/lora/model.py b/MoRA/peft_mora/tuners/lora/model.py new file mode 100644 index 0000000000000000000000000000000000000000..179f4a8b1b49ae3b04ba30dba000176f6ad736ba --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/model.py @@ -0,0 +1,730 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import math +import operator +import re +import warnings +from dataclasses import asdict, replace +from enum import Enum +from functools import reduce +from itertools import chain +from typing import Literal, Optional + +import torch +from torch import nn +from tqdm import tqdm + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists, onload_layer +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _freeze_adapter, + _get_submodules, + get_quantization_config, +) +from peft_mora.utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task_arithmetic, ties + +from .aqlm import dispatch_aqlm +from .awq import dispatch_awq +from .config import LoraConfig +from .gptq import dispatch_gptq +from .layer import Conv2d, LoraLayer, dispatch_default +from .tp_layer import dispatch_megatron + + +class LoraModel(BaseTuner): + """ + Creates Low Rank Adapter (LoRA) model from a pretrained transformers model. 
+ + The method is described in detail in https://arxiv.org/abs/2106.09685. + + Args: + model ([`torch.nn.Module`]): The model to be adapted. + config ([`LoraConfig`]): The configuration of the Lora model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The Lora model. + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import LoraModel, LoraConfig + + >>> config = LoraConfig( + ... task_type="SEQ_2_SEQ_LM", + ... r=8, + ... lora_alpha=32, + ... target_modules=["q", "v"], + ... lora_dropout=0.01, + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> lora_model = LoraModel(model, config, "default") + ``` + + ```py + >>> import transformers + >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_int8_training + + >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"] + >>> config = LoraConfig( + ... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" + ... ) + + >>> model = transformers.GPTJForCausalLM.from_pretrained( + ... "kakaobrain/kogpt", + ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b + ... pad_token_id=tokenizer.eos_token_id, + ... use_cache=False, + ... device_map={"": rank}, + ... torch_dtype=torch.float16, + ... load_in_8bit=True, + ... ) + >>> model = prepare_model_for_int8_training(model) + >>> lora_model = get_peft_model(model, config) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`LoraConfig`]): The configuration of the Lora model. + """ + + prefix: str = "lora_" + + def __init__(self, model, config, adapter_name) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: LoraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check + # does not fully correspond to the error message. + if (len(self.peft_config) > 1) and (config.bias != "none"): + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." 
+ ) + + @staticmethod + def _check_target_module_exists(lora_config, key): + return check_target_module_exists(lora_config, key) + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(chain(lora_config.rank_pattern.keys(), lora_config.alpha_pattern.keys())) + target_name_key = next(filter(lambda key: re.match(rf".*\.{key}$", current_key), pattern_keys), current_key) + r = lora_config.rank_pattern.get(target_name_key, lora_config.r) + alpha = lora_config.alpha_pattern.get(target_name_key, lora_config.lora_alpha) + + + kwargs = { + "r": r, + "lora_alpha": alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "use_rslora": lora_config.use_rslora, + "use_dora": lora_config.use_dora, + "use_mora": lora_config.use_mora, + "mora_type": lora_config.mora_type, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + + + use_mora = lora_config.use_mora + + quant_methods = ["gptq", "aqlm", "awq"] + for quant_method in quant_methods: + quantization_config = get_quantization_config(self.model, method=quant_method) + if quantization_config is not None: + kwargs[f"{quant_method}_quantization_config"] = quantization_config + + # note: AdaLoraLayer is a subclass of LoraLayer, we need to exclude it + from peft_mora.tuners.adalora import AdaLoraLayer + + if isinstance(target, LoraLayer) and not isinstance(target, AdaLoraLayer): + target.update_layer( + adapter_name, + r, + lora_alpha=alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, + use_rslora=lora_config.use_rslora, + use_dora=lora_config.use_dora, + use_mora=use_mora, + mora_type=lora_config.mora_type, + ) + else: + new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if (self.prefix in name) or ("ranknum" in name): + weight = child.qweight if hasattr(child, "qweight") else child.weight + module.to(weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = self.peft_config[active_adapter].bias + if 
bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + for m in model.modules(): + if isinstance(m, LoraLayer) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(lora_config, adapter_name, target, **kwargs): + # Collect dispatcher functions to decide what backend to use for the replaced LoRA layer. The order matters, + # because the first match is always used. Therefore, the default layers should be checked last. + dispatchers = [] + + # avoid eager bnb import + if is_bnb_available(): + from .bnb import dispatch_bnb_8bit + + dispatchers.append(dispatch_bnb_8bit) + + if is_bnb_4bit_available(): + from .bnb import dispatch_bnb_4bit + + dispatchers.append(dispatch_bnb_4bit) + + dispatchers.extend([dispatch_aqlm, dispatch_awq, dispatch_gptq, dispatch_megatron, dispatch_default]) + + new_module = None + for dispatcher in dispatchers: + new_module = dispatcher(target, adapter_name, lora_config=lora_config, **kwargs) + if new_module is not None: # first match wins + break + + if new_module is None: + # no module could be matched + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`, `torch.nn.Embedding`, `torch.nn.Conv2d`, `transformers.pytorch_utils.Conv1D`." + ) + + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled: bool = True) -> None: + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. + """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + for active_adapter in self.active_adapters: + val = self.peft_config[active_adapter].bias + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. 
+ """ + for module in self.model.modules(): + if isinstance(module, LoraLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge LORA layers when the model is gptq quantized") + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + with onload_layer(target): + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def add_weighted_adapter( + self, + adapters, + weights, + adapter_name, + combination_type="svd", + svd_rank=None, + svd_clamp=None, + svd_full_matrices=True, + svd_driver=None, + density=None, + majority_sign_method: Literal["total", "frequency"] = "total", + ) -> None: + """ + This method adds a new adapter by merging the given adapters with the given weights. + + When using the `cat` combination_type you should be aware that rank of the resulting adapter will be equal to + the sum of all adapters ranks. So it's possible that the mixed adapter may become too big and result in OOM + errors. + + Args: + adapters (`list`): + List of adapter names to be merged. + weights (`list`): + List of weights for each adapter. + adapter_name (`str`): + Name of the new adapter. + combination_type (`str`): + The merging type can be one of [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, + `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]. When using the `cat` + combination_type, the rank of the resulting adapter is equal to the sum of all adapters ranks (the + mixed adapter may be too big and result in OOM errors). + svd_rank (`int`, *optional*): + Rank of output adapter for svd. If None provided, will use max rank of merging adapters. + svd_clamp (`float`, *optional*): + A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform + clamping. Defaults to None. 
+ svd_full_matrices (`bool`, *optional*): + Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned + tensors U and Vh. Defaults to True. + svd_driver (`str`, *optional*): + Name of the cuSOLVER method to be used. This keyword argument only works when merging on CUDA. Can be + one of [None, `gesvd`, `gesvdj`, `gesvda`]. For more info please refer to `torch.linalg.svd` + documentation. Defaults to None. + density (`float`, *optional*): + Value between 0 and 1. 0 means all values are pruned and 1 means no values are pruned. Should be used + with [`ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, + `magnintude_prune`, `magnitude_prune_svd`] + majority_sign_method (`str`): + The method, should be one of ["total", "frequency"], to use to get the magnitude of the sign values. + Should be used with [`ties`, `ties_svd`, `dare_ties`, `dare_ties_svd`] + """ + + if adapter_name in list(self.peft_config.keys()): + return + for adapter in adapters: + if adapter not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter} does not exist") + + # if there is only one adapter, we can only use linear merging + combination_type = "linear" if len(adapters) == 1 else combination_type + + adapters_ranks = [self.peft_config[adapter].r for adapter in adapters] + if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"): + # all adapters ranks should be same, new rank is just this value + if len(set(adapters_ranks)) != 1: + raise ValueError( + "All adapters must have the same r value when using combination_type linear, ties, dare_ties or dare_linear." + ) + new_rank = adapters_ranks[0] + elif combination_type == "cat": + # adapters ranks may be different, new rank is sum of all ranks + # be careful, because output adapter rank may be really big if mixing a lot of adapters + new_rank = sum(adapters_ranks) + elif combination_type.endswith("svd"): + # new rank is the max of all ranks of the adapters if not provided + new_rank = svd_rank or max(adapters_ranks) + else: + raise ValueError(f"Invalid combination_type: {combination_type}") + + target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters] + if not target_module_types: + raise ValueError(f"Found no adapter matching the names in {adapters}") + if len(set(target_module_types)) > 1: + raise ValueError( + "all adapter configs should follow the same target modules type. " + "Combining adapters with `target_modules` type being a mix of list/set and string is not supported." + ) + + if target_module_types[0] == str: + new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) + elif target_module_types[0] == set: + new_target_modules = reduce( + operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters) + ) + else: + raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules") + + self.peft_config[adapter_name] = replace( + self.peft_config[adapters[0]], + r=new_rank, + lora_alpha=new_rank, + target_modules=new_target_modules, + ) + self.inject_adapter(self.model, adapter_name) + + # Do we really need that? 
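+        # _freeze_adapter sets requires_grad=False on the newly injected adapter's parameters; since this adapter is synthesized from the existing adapters' weights, it is presumably not meant to be trained further.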
+ _freeze_adapter(self.model, adapter_name) + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LoraLayer): + if adapter_name in target.lora_A: + target_lora_A = target.lora_A[adapter_name].weight + target_lora_B = target.lora_B[adapter_name].weight + elif adapter_name in target.lora_embedding_A: + target_lora_A = target.lora_embedding_A[adapter_name] + target_lora_B = target.lora_embedding_B[adapter_name] + else: + continue + + target_lora_A.data = target_lora_A.data * 0.0 + target_lora_B.data = target_lora_B.data * 0.0 + if combination_type == "cat": + loras_A, loras_B = [], [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter]) + loras_B.append(current_adapter_lora_B.data) + + if len(loras_A) == 0: + raise ValueError("No matching LoRAs found. Please raise an issue on GitHub.") + loras_A = torch.cat(loras_A, dim=0) + loras_B = torch.cat(loras_B, dim=1) + target_lora_A.data[: loras_A.shape[0], :] = loras_A + target_lora_B.data[:, : loras_B.shape[1]] = loras_B + elif combination_type in [ + "svd", + "ties_svd", + "dare_linear_svd", + "dare_ties_svd", + "magnitude_prune_svd", + ]: + target_lora_A.data, target_lora_B.data = self._svd_generalized_task_arithmetic_weighted_adapter( + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + svd_clamp, + full_matrices=svd_full_matrices, + driver=svd_driver, + ) + elif combination_type in ["linear", "ties", "dare_linear", "dare_ties", "magnitude_prune"]: + target_lora_A.data, target_lora_B.data = self._generalized_task_arithmetic_weighted_adapter( + combination_type, adapters, weights, target, density, majority_sign_method + ) + + def _svd_generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + clamp=None, + full_matrices=True, + driver=None, + ): + valid_adapters = [] + valid_weights = [] + is_embedding = any(adapter in target.lora_embedding_A for adapter in adapters) + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A or adapter in target.lora_embedding_A: + valid_adapters.append(adapter) + valid_weights.append(weight * target.scaling[adapter]) + + # if no valid adapter, nothing to do + if len(valid_adapters) == 0: + raise ValueError("No matching LoRAs found. 
Please raise an issue on Github.") + delta_weight = [target.get_delta_weight(adapter) for adapter in valid_adapters] + valid_weights = torch.tensor(valid_weights).to(delta_weight[0].device) + if combination_type == "svd": + delta_weight = task_arithmetic(delta_weight, valid_weights) + elif combination_type == "ties_svd": + delta_weight = ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear_svd": + delta_weight = dare_linear(delta_weight, valid_weights, density) + elif combination_type == "dare_ties_svd": + delta_weight = dare_ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune_svd": + delta_weight = magnitude_prune(delta_weight, valid_weights, density) + else: + raise ValueError(f"Invalid value passed to combination type: {combination_type}") + + conv2d = isinstance(target, Conv2d) + if conv2d: + conv2d_1x1 = target.weight.size()[2:4] == (1, 1) + if not conv2d_1x1: + delta_weight = delta_weight.flatten(start_dim=1) + else: + delta_weight = delta_weight.squeeze() + if (hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out) or is_embedding: + delta_weight = delta_weight.T + + # based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131 + U, S, Vh = torch.linalg.svd(delta_weight, full_matrices=full_matrices, driver=driver) + U = U[:, :new_rank] + S = S[:new_rank] + U = U @ torch.diag(S) + Vh = Vh[:new_rank, :] + if clamp is not None: + dist = torch.cat([U.flatten(), Vh.flatten()]) + hi_val = torch.quantile(dist, clamp) + low_val = -hi_val + U = U.clamp(low_val, hi_val) + Vh = Vh.clamp(low_val, hi_val) + if conv2d: + U = U.reshape(target_lora_B.data.shape) + Vh = Vh.reshape(target_lora_A.data.shape) + return Vh, U + + def _generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + target, + density, + majority_sign_method, + ): + # account weights for LoRA A and B layers. 
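+        # The square root below splits each weight (together with the adapter's scaling) evenly between the A and B factors, so that each adapter's own B @ A contribution ends up scaled by weight * scaling overall.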
+ valid_weights = [] + lora_A_deltas = [] + lora_B_deltas = [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + valid_weights.append(math.sqrt(weight * target.scaling[adapter])) + lora_A_deltas.append(current_adapter_lora_A.data) + lora_B_deltas.append(current_adapter_lora_B.data) + valid_weights = torch.tensor(valid_weights).to(lora_A_deltas[0].device) + lora_deltas = [lora_A_deltas, lora_B_deltas] + dtype = lora_A_deltas[0].dtype + for i, task_tensors in enumerate(lora_deltas): + if combination_type == "linear": + lora_deltas[i] = task_arithmetic(task_tensors, valid_weights) + elif combination_type == "ties": + lora_deltas[i] = ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear": + lora_deltas[i] = dare_linear(task_tensors, valid_weights, density) + elif combination_type == "dare_ties": + lora_deltas[i] = dare_ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune": + lora_deltas[i] = magnitude_prune(task_tensors, valid_weights, density) + else: + raise ValueError("Invalid combination type") + lora_deltas = [delta.to(dtype) for delta in lora_deltas] + return lora_deltas + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LoraLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> torch.nn.Module: + r""" + This method merges the LoRa layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. 
This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) diff --git a/MoRA/peft_mora/tuners/lora/tp_layer.py b/MoRA/peft_mora/tuners/lora/tp_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..57ba0eef2114ae1bdc6d29e385d0b166620aab91 --- /dev/null +++ b/MoRA/peft_mora/tuners/lora/tp_layer.py @@ -0,0 +1,239 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import warnings +from typing import Any, Optional + +import torch +import torch.nn as nn +import torch.nn.init as init + +from peft_mora.tuners.tuners_utils import BaseTunerLayer + +from .layer import LoraLayer + + +class LoraParallelLinear(nn.Module, LoraLayer): + """ + When the target layer parallel_linear is RowParallelLinear, in order to keep the input and output shapes + consistent, we need to split the lora matrix A into rows, and the lora_B at this time should be a complete linear + layer; In the same way, when the target layer is ColumnParallelLinear, we perform column segmentation on lora_B, + while lora_A is still a complete linear layer. + """ + + def __init__( + self, + base_layer, + adapter_name: str, + backend, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, + init_lora_weights: bool = True, + use_rslora: bool = False, + use_dora: bool = False, + **kwargs, + ): + super().__init__() + LoraLayer.__init__(self, base_layer=base_layer) + + if use_dora: + raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") + + self.backend = backend + self.is_parallel_a = isinstance(base_layer, backend.RowParallelLinear) + self.fan_in_fan_out = fan_in_fan_out + self._active_adapter = adapter_name + + megatron_config = kwargs["megatron_config"] + parallel_linear_kwargs = {"megatron_config": megatron_config} + init_method = init.xavier_normal_ + if hasattr(megatron_config, "init_method"): + init_method = megatron_config.init_method + input_is_parallel = True + gather_output = False + if isinstance(base_layer, self.backend.RowParallelLinear): + input_is_parallel = base_layer.input_is_parallel + else: + gather_output = base_layer.gather_output + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + init_method=init_method, + input_is_parallel=input_is_parallel, + gather_output=gather_output, + **parallel_linear_kwargs, + ) + + self.is_target_conv_1d_layer = False + + @property + def is_paralle_a(self): + # TODO: remove it in PEFT 0.10.0 + # See https://github.com/huggingface/peft/pull/1439 for more details + warnings.warn( + "`is_paralle_a` is going to be deprecated in a future release. 
Please use `is_parallel_a`", FutureWarning + ) + return self.is_parallel_a + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, + use_dora=False, + init_method=init.xavier_normal_, + input_is_parallel=True, + gather_output=False, + **parallel_linear_kwargs, + ): + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + self.r[adapter_name] = r + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout[adapter_name] = lora_dropout_layer + + megatron_config = parallel_linear_kwargs["megatron_config"] + # lora needs to be forced to upgrade to 32-bit precision, otherwise it will overflow + megatron_config.params_dtype = torch.float32 + if self.is_parallel_a: + lora_a = self.backend.RowParallelLinear( + input_size=self.in_features, + output_size=r, + bias=False, + input_is_parallel=input_is_parallel, + skip_bias_add=True, + init_method=init_method, + config=megatron_config, + ) + lora_b = nn.Linear(in_features=r, out_features=self.out_features, bias=False, dtype=torch.float32) + else: + lora_a = nn.Linear(in_features=self.in_features, out_features=r, bias=False, dtype=torch.float32) + lora_b = self.backend.ColumnParallelLinear( + input_size=r, + output_size=self.out_features, + bias=False, + gather_output=gather_output, + init_method=init_method, + config=megatron_config, + ) + self.lora_A[adapter_name] = lora_a + self.lora_B[adapter_name] = lora_b + if use_rslora: + self.scaling[adapter_name] = lora_alpha / (r**0.5) + else: + self.scaling[adapter_name] = lora_alpha / r + if init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any): + previous_dtype = x.dtype + # If weight is used for matrix multiplication here, the final aggregation operation of the original + # parallel_linear layer will be missing, so we need to directly call its forward function to obtain the + # output of the original parallel_linear layer. 
+ if self.disable_adapters: + if self.merged: + self.unmerge() + result, bias = self.base_layer(x, *args, **kwargs) + elif self.merged: + result, bias = self.base_layer(x, *args, **kwargs) + else: + result, bias = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + lora_result = lora_A(dropout(x)) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_B(lora_result) + if isinstance(lora_result, tuple): + lora_result = lora_result[0] + lora_result = lora_result * scaling + + result = result + lora_result + + result = result.to(previous_dtype) + return result, bias + + +def dispatch_megatron( + target: torch.nn.Module, + adapter_name: str, + lora_config, + **kwargs: Any, +) -> Optional[torch.nn.Module]: + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if lora_config.megatron_config: + megatron_core = importlib.import_module(lora_config.megatron_core) + else: + megatron_core = None + + if megatron_core and isinstance( + target_base_layer, + (megatron_core.tensor_parallel.ColumnParallelLinear, megatron_core.tensor_parallel.RowParallelLinear), + ): + megatron_kwargs = kwargs.copy() + megatron_config = lora_config.megatron_config + if isinstance(megatron_config, dict): + transformer_config_class = megatron_core.transformer.transformer_config.TransformerConfig + megatron_config = transformer_config_class(**lora_config.megatron_config) + megatron_kwargs["megatron_config"] = megatron_config + if megatron_kwargs["fan_in_fan_out"]: + warnings.warn( + "fan_in_fan_out is set to True but the target module is `ColumnParallelLinear` " + "or `RowParallelLinear`. " + "Setting fan_in_fan_out to False." + ) + megatron_kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False + new_module = LoraParallelLinear( + base_layer=target, adapter_name=adapter_name, backend=megatron_core.tensor_parallel, **megatron_kwargs + ) + + return new_module diff --git a/MoRA/peft_mora/tuners/lycoris_utils.py b/MoRA/peft_mora/tuners/lycoris_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4eaa6af8aa942d4a7fd3254db5ed1f64c1743efa --- /dev/null +++ b/MoRA/peft_mora/tuners/lycoris_utils.py @@ -0,0 +1,428 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
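The dispatching logic above (`dispatch_default`, `dispatch_megatron`, and `LoraModel._create_new_module`) follows a simple first-match-wins chain: each dispatcher inspects the (unwrapped) target layer and returns either a replacement module or `None`. A minimal sketch of that pattern; `dispatch_linear_sketch` is a hypothetical, simplified dispatcher, and the real ones also receive the `LoraConfig` plus quantization/megatron kwargs:

``` python
from typing import Callable, List, Optional

import torch


def dispatch_linear_sketch(target: torch.nn.Module, adapter_name: str, **kwargs) -> Optional[torch.nn.Module]:
    # Return a replacement module only if this dispatcher recognizes the layer type.
    if isinstance(target, torch.nn.Linear):
        return torch.nn.Identity()  # stand-in for the real LoRA-wrapped Linear
    return None


def create_new_module_sketch(
    target: torch.nn.Module,
    adapter_name: str,
    dispatchers: List[Callable[..., Optional[torch.nn.Module]]],
    **kwargs,
) -> torch.nn.Module:
    # Try each dispatcher in order; the first non-None result wins, so specialized
    # backends (bnb, gptq, aqlm, awq, megatron) must be listed before the defaults.
    for dispatcher in dispatchers:
        new_module = dispatcher(target, adapter_name, **kwargs)
        if new_module is not None:
            return new_module
    raise ValueError(f"Target module {target} is not supported.")


replaced = create_new_module_sketch(torch.nn.Linear(4, 4), "default", [dispatch_linear_sketch])
```

Listing the specialized backends before `dispatch_default` is what lets quantized or tensor-parallel layers be wrapped before the generic Linear/Embedding/Conv2d handlers get a chance.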
+from __future__ import annotations + +import warnings +from abc import abstractmethod +from dataclasses import dataclass, field +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +from tqdm import tqdm + +from peft_mora.config import PeftConfig +from peft_mora.utils import ( + ModulesToSaveWrapper, + _get_submodules, +) + +from .tuners_utils import BaseTuner, BaseTunerLayer, check_adapters_to_merge, check_target_module_exists + + +@dataclass +class LycorisConfig(PeftConfig): + r""" + A base config for LyCORIS like adapters + """ + + rank_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" + ) + }, + ) + alpha_pattern: Optional[dict] = field( + default_factory=dict, + metadata={ + "help": ( + "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `alpha`. " + "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" + ) + }, + ) + + +class LycorisLayer(BaseTunerLayer): + r""" + A base layer for LyCORIS like adapters + """ + + # adapter_layer_names needs to be defined on the child class + other_param_names = ("r", "alpha", "scaling", "rank_dropout", "module_dropout") + + def __init__(self, base_layer: nn.Module) -> None: + self.base_layer = base_layer + self.r = {} + self.alpha = {} + self.scaling = {} + self.rank_dropout = {} + self.module_dropout = {} + + # Tuner info + self._disable_adapters = False + self.merged_adapters = [] + + @property + @abstractmethod + def _available_adapters(self) -> set[str]: + ... + + def _init_empty_weights(self, cls, *args, **kwargs) -> None: + # A helper method that allows to initialize the layer of the given class without spending time to initialize the + # model weights. The implementation is inspired by + # https://pytorch.org/docs/stable/generated/torch.nn.utils.skip_init.html but this function cannot be used + # directly. + # Instead of this approach, it would be possible to bypass the __init__ of the class but that runs the risk of + # omitting important logic inside that __init__. + kwargs = kwargs.copy() + final_device = kwargs.pop("device", "cpu") + cls.__init__(self, *args, device="meta", **kwargs) + self.to_empty(device=final_device) + + @abstractmethod + def create_adapter_parameters(self, adapter_name: str, r: int, **kwargs): + ... + + # TODO: refactor LoRA to use the same approach + @abstractmethod + def _get_delta_activations(self, adapter_name: str, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + """Activations added on top of the base layer output (i.e. after the base layer forward pass)""" + + @abstractmethod + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + ... + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. 
+ """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + if safe_merge: + orig_weights = base_layer.weight.data.clone() + orig_weights += self.get_delta_weight(active_adapter) + + if not torch.isfinite(orig_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = orig_weights + else: + base_layer.weight.data += self.get_delta_weight(active_adapter) + self.merged_adapters.append(active_adapter) + + @abstractmethod + def reset_adapter_parameters(self, adapter_name: str): + ... + + def set_scale(self, adapter, scale): + if adapter not in self._available_adapters: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + self.scaling[active_adapter] *= scale + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + if scale is None: + self.scaling[active_adapter] = self.alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + @abstractmethod + def update_layer(self, adapter_name: str, r: int, alpha: float, **kwargs): + ... + + +class LycorisTuner(BaseTuner): + r""" + A base tuner for LyCORIS like adapters + """ + + prefix: str + layers_mapping: dict[type[torch.nn.Module], type[LycorisLayer]] + + def __init__(self, model, config, adapter_name): + super().__init__(model, config, adapter_name) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + @staticmethod + def _check_target_module_exists(config, key): + return check_target_module_exists(config, key) + + @abstractmethod + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[LycorisLayer, nn.Module], + target_name, + parent, + current_key, + ): + ... 
+ + @classmethod + def _create_new_module(cls, config: LycorisConfig, adapter_name: str, target: nn.Module, **kwargs) -> LycorisLayer: + # Find corresponding subtype of provided target module + new_module_cls = None + for subtype, target_cls in cls.layers_mapping.items(): + if ( + hasattr(target, "base_layer") + and isinstance(target.get_base_layer(), subtype) + and isinstance(target, BaseTunerLayer) + ): + # nested tuner layers are allowed + new_module_cls = target_cls + break + elif isinstance(target, subtype): + new_module_cls = target_cls + break + + # We didn't find corresponding type, so adapter for this layer is not supported + if new_module_cls is None: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) + raise ValueError( + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" + ) + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Conv2d): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + elif isinstance(target_base_layer, torch.nn.Linear): + new_module = new_module_cls(target, adapter_name=adapter_name, **kwargs) + else: + supported_modules = ", ".join(layer.__name__ for layer in cls.layers_mapping.keys()) + raise ValueError( + f"Target module of type {type(target)} not supported, " + f"currently only adapters for {supported_modules} are supported" + ) + + return new_module + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + raise ValueError("Please specify `target_modules` in `peft_config`") + return peft_config + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if self.prefix in name: + module.to(child.weight.device) + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def _unload_and_optionally_merge( + self, + merge: bool = True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge LOHA layers when the model is gptq quantized") + + self._unloading_checks(adapter_names) + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if 
hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. + """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + self._set_adapter_layers(enabled=False) + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> torch.nn.Module: + r""" + This method merges the adapter layers into the base model. This is needed if someone wants to use the base + model as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. + """ + for module in self.model.modules(): + if isinstance(module, LycorisLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (`str`): Name of the adapter to be deleted. 
+ """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, LycorisLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] diff --git a/MoRA/peft_mora/tuners/mixed/__init__.py b/MoRA/peft_mora/tuners/mixed/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2955d7258ddcf76b47b38fd6fd5ebeb3d1d6110c --- /dev/null +++ b/MoRA/peft_mora/tuners/mixed/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .model import COMPATIBLE_TUNER_TYPES, MixedModel + + +__all__ = ["COMPATIBLE_TUNER_TYPES", "MixedModel"] diff --git a/MoRA/peft_mora/tuners/mixed/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/mixed/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e1b1ca38b414cba3d28434168d863a7b916b0e87 Binary files /dev/null and b/MoRA/peft_mora/tuners/mixed/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/mixed/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/mixed/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ac119ac5321e1412b8d8f5a89b38e2afbfc1313 Binary files /dev/null and b/MoRA/peft_mora/tuners/mixed/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/mixed/model.py b/MoRA/peft_mora/tuners/mixed/model.py new file mode 100644 index 0000000000000000000000000000000000000000..9a660d6fe9ef2967749abefdf9508f8493d79aa6 --- /dev/null +++ b/MoRA/peft_mora/tuners/mixed/model.py @@ -0,0 +1,339 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
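The `merge_and_unload`, `unload`, and `set_adapter` methods defined on the Lycoris tuner above follow the same calling convention as their LoRA counterparts. A minimal usage sketch, assuming a LoHa/LoKr/OFT adapter has already been attached via `get_peft_model` and that the wrapper forwards these calls as in upstream peft:

``` python
# Sketch only: assumes `peft_model` wraps a Lycoris-style adapter (LoHa, LoKr, OFT).
# Fold the active adapters into the base weights, with a progress bar and NaN check:
merged_model = peft_model.merge_and_unload(progressbar=True, safe_merge=True)

# Or strip the adapter modules without merging, recovering the original base model:
base_model = peft_model.unload()

# Switch the active adapter; merged layers are unmerged first, as warned in `set_adapter`:
peft_model.set_adapter("default")
```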
+from __future__ import annotations + +import warnings +from typing import Any, Optional, Union + +from torch import nn +from tqdm import tqdm + +from peft_mora.tuners import adalora, loha, lokr, lora, oft +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + PeftType, + _get_submodules, + get_auto_gptq_quant_linear, +) + + +# Collection of constants used for all tuners +COMPATIBLE_TUNER_TYPES = (PeftType.LORA, PeftType.LOHA, PeftType.LOKR, PeftType.ADALORA, PeftType.OFT) +PREFIXES = [lora.LoraModel.prefix, lokr.LoKrModel.prefix, loha.LoHaModel.prefix, oft.OFTModel.prefix] +Configs = Union[lora.LoraConfig, loha.LoHaConfig, lokr.LoKrConfig, adalora.AdaLoraConfig, oft.OFTConfig] +Layers = (lora.layer.LoraLayer, loha.layer.LoHaLayer, lokr.layer.LoKrLayer, adalora.layer.AdaLoraLayer, oft.OFTLayer) + + +class MixedModel(BaseTuner): + """ + A class that allows to mix different types of adapters in a single model. + + Note: This class should usually not be initialized directly. Instead, use `get_peft_model` with the argument + `mixed=True`. + + Args: + model (:obj:`nn.Module`): + The model to be tuned. + config (:obj:`PeftConfig`): + The config of the model to be tuned. The adapter type must be compatible. + adapter_name (:obj:`str`): + The name of the first adapter. + """ + + def __init__(self, model: nn.Module, config: Configs, adapter_name: str) -> None: + super().__init__(model, config, adapter_name) + + def _check_new_adapter_config(self, config: Configs) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + if not isinstance(config, Configs.__args__): + raise ValueError( + f"{self.__class__.__name__} only supports {COMPATIBLE_TUNER_TYPES} configs, but got {type(config)}." + ) + + biases = (getattr(config, "bias", None) for config in self.peft_config) + biases = [bias for bias in biases if bias not in (None, "none")] + if len(biases) > 1: + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." 
+ ) + + @staticmethod + def _check_target_module_exists(config: Configs, key: str): + return check_target_module_exists(config, key) + + def _create_and_replace( + self, + config: Configs, + *args: Any, + **kwargs: Any, + ) -> None: + if isinstance(config, adalora.AdaLoraConfig): + adalora.AdaLoraModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, lora.LoraConfig): + lora.LoraModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, loha.LoHaConfig): + loha.LoHaModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, lokr.LoKrConfig): + lokr.LoKrModel._create_and_replace(self, config, *args, **kwargs) + elif isinstance(config, oft.OFTConfig): + oft.OFTModel._create_and_replace(self, config, *args, **kwargs) + else: + raise ValueError(f"Unsupported config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.") + + def _replace_module(self, parent, child_name, new_module, child) -> None: + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.get_base_layer() + elif hasattr(child, "quant_linear_module"): + # TODO maybe not necessary to have special treatment? + child = child.quant_linear_module + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if any(prefix in name for prefix in PREFIXES): + module.to(child.weight.device) + if "ranknum" in name: + module.to(child.weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if not any(prefix in n for prefix in PREFIXES): + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = getattr(self.peft_config[active_adapter], "bias", "none") + if bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + # TODO: check if this is needed for other supported types + for m in model.modules(): + if isinstance(m, Layers) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise ValueError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(config, adapter_name, target, **kwargs): + gptq_quantization_config = kwargs.get("gptq_quantization_config", None) + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + if (gptq_quantization_config is not None) or (AutoGPTQQuantLinear is not None): + raise ValueError(f"GPTQ quantization not supported for {config.peft_type.value} (yet).") + + loaded_in_8bit = kwargs.pop("loaded_in_8bit", False) + loaded_in_4bit = kwargs.pop("loaded_in_4bit", False) + if loaded_in_8bit or loaded_in_4bit: + raise ValueError(f"8bit and 4bit quantization not supported for {config.peft_type.value} (yet).") + + if isinstance(config, adalora.AdaLoraConfig): + new_module = adalora.AdaLoraModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, 
lora.LoraConfig): + new_module = lora.LoraModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, loha.LoHaConfig): + new_module = loha.LoHaModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, lokr.LoKrConfig): + new_module = lokr.LoKrModel._create_new_module(config, adapter_name, target, **kwargs) + elif isinstance(config, oft.OFTConfig): + new_module = oft.OFTModel._create_new_module(config, adapter_name, target, **kwargs) + else: + raise ValueError(f"Unknown config type {type(config)}, should be one of {COMPATIBLE_TUNER_TYPES}.") + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + for active_adapter in self.active_adapters: + val = getattr(self.peft_config[active_adapter], "bias", "none") + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: Union[str, list[str]]) -> None: + for module in self.model.modules(): + if isinstance(module, Layers): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge layers when the model is gptq quantized") + + def merge_recursively(module): + # helper function to recursively merge the base_layer of the target + path = [] + layer = module + while hasattr(layer, "base_layer"): + path.append(layer) + layer = layer.base_layer + for layer_before, layer_after in zip(path[:-1], path[1:]): + layer_after.merge(safe_merge=safe_merge, adapter_names=adapter_names) + layer_before.base_layer = layer_after.base_layer + module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + + key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)] + desc = "Unloading " + ("and merging " if merge else "") + "model" + + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + + if hasattr(target, "base_layer"): + if merge: + merge_recursively(target) + self._replace_module(parent, 
target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def add_weighted_adapter(self, *args: Any, **kwargs: Any) -> None: + raise NotImplementedError(f"Weighted adapters are not supported for {self.__class__.__name__} (yet).") + + def delete_adapter(self, adapter_name: Union[str, list[str]]) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (Union[str, list[str]]): Name of the adapter(s) to delete. + """ + if isinstance(adapter_name, str): + adapter_names = [adapter_name] + else: + adapter_names = adapter_name + + mismatched = set(adapter_names) - set(self.peft_config.keys()) + if mismatched: + raise ValueError( + f"Adapter(s) {sorted(mismatched)} not found, available adapters: {sorted(self.peft_config.keys())}" + ) + + for adapter_name in adapter_names: + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if not any(prefix in key for prefix in PREFIXES)] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, BaseTunerLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> nn.Module: + r""" + This method merges the layers into the base model. This is needed if someone wants to use the base model as a + standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def generate(self, *args: Any, **kwargs: Any): + return self.model.generate(*args, **kwargs) diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__init__.py b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..214f7722486485bea4ede3b5c1a433aac447dd2b --- /dev/null +++ b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit +from .model import MultitaskPromptEmbedding + + +__all__ = ["MultitaskPromptTuningConfig", "MultitaskPromptTuningInit", "MultitaskPromptEmbedding"] diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cef42d1dc07a447378c63f4c398ce54713af2cd8 Binary files /dev/null and b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10e711620d8d551562b91eaf69ceff945fd003eb Binary files /dev/null and b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ffef858cdcd56a0df6a779d40f7b39e2dad4f5f Binary files /dev/null and b/MoRA/peft_mora/tuners/multitask_prompt_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/config.py b/MoRA/peft_mora/tuners/multitask_prompt_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..3ccd12e2df8bc6869ae00a24cef946f8fa37b51a --- /dev/null +++ b/MoRA/peft_mora/tuners/multitask_prompt_tuning/config.py @@ -0,0 +1,61 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
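The `MixedModel` above is normally reached through `get_peft_model(..., mixed=True)`, as its docstring notes. A sketch of mixing two compatible adapter types; the `mixed=True` flag and `add_adapter` signature follow the upstream peft API, and all names here are illustrative:

``` python
from peft_mora import LoHaConfig, LoraConfig, get_peft_model

# `base_model` is assumed to be any supported transformers model.
lora_cfg = LoraConfig(r=8, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
loha_cfg = LoHaConfig(r=8, target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")

mixed = get_peft_model(base_model, lora_cfg, adapter_name="lora", mixed=True)
mixed.add_adapter("loha", loha_cfg)   # second adapter of a different, compatible type
mixed.set_adapter(["lora", "loha"])   # activate both; MixedModel applies them jointly
merged = mixed.merge_and_unload()     # or mixed.unload() to drop adapters without merging
```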
+ +import enum +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft_mora.tuners.prompt_tuning import PromptTuningConfig +from peft_mora.utils import PeftType + + +class MultitaskPromptTuningInit(str, enum.Enum): + # initialize prompt with text + TEXT = "TEXT" + # initialize prompt with random matrix + RANDOM = "RANDOM" + # average the prefix and column matrices obtained during source training + AVERAGE_SOURCE_TASKS = "AVERAGE_SOURCE_TASKS" + # pick prefix and column matrices for a particular task obtained during source training + EXACT_SOURCE_TASK = "EXACT_SOURCE_TASK" + # only use the prompt embeddings trained during source training + ONLY_SOURCE_SHARED = "ONLY_SOURCE_SHARED" + + +@dataclass +class MultitaskPromptTuningConfig(PromptTuningConfig): + prompt_tuning_init: Union[MultitaskPromptTuningInit, str] = field( + default=MultitaskPromptTuningInit.RANDOM, + metadata={ + "help": ( + "How to initialize the prompt tuning parameters. Can be one of TEXT, RANDOM, AVERAGE_SOURCE_TASKS, " + "EXACT_SOURCE_TASK, ONLY_SOURCE_SHARED." + ), + }, + ) + prompt_tuning_init_state_dict_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "The path of source state dict. This is required when training the downstream target prompt from " + "the pretrained source prompt" + ), + }, + ) + prompt_tuning_init_task: Optional[int] = field(default=0, metadata={"help": "source task id for initialization"}) + num_ranks: Optional[int] = field(default=1, metadata={"help": "ranks"}) + num_tasks: Optional[int] = field(default=1, metadata={"help": "number of tasks"}) + + def __post_init__(self): + self.peft_type = PeftType.MULTITASK_PROMPT_TUNING diff --git a/MoRA/peft_mora/tuners/multitask_prompt_tuning/model.py b/MoRA/peft_mora/tuners/multitask_prompt_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b317b94c8c07afe3a85367cf045bc2be54ace773 --- /dev/null +++ b/MoRA/peft_mora/tuners/multitask_prompt_tuning/model.py @@ -0,0 +1,115 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from peft_mora.tuners.prompt_tuning import PromptEmbedding +from peft_mora.utils import TaskType + +from .config import MultitaskPromptTuningConfig, MultitaskPromptTuningInit + + +# This code is adapted for the paper: https://arxiv.org/abs/2303.02861 and +# constitutes the work done at MIT-IBM Watson Research Lab. 
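To make the `MultitaskPromptTuningConfig` options above concrete, here is a sketch of a source-stage config and a target-stage config that re-initializes from the source run; the state-dict path is a hypothetical placeholder:

``` python
from peft_mora import MultitaskPromptTuningConfig, MultitaskPromptTuningInit

# Source stage: train one shared prompt plus per-task low-rank factors across 8 tasks.
source_cfg = MultitaskPromptTuningConfig(
    task_type="SEQ_2_SEQ_LM",
    num_virtual_tokens=50,
    num_tasks=8,
    num_ranks=1,
    prompt_tuning_init=MultitaskPromptTuningInit.RANDOM,
)

# Target stage: initialize from the averaged source factors saved earlier.
target_cfg = MultitaskPromptTuningConfig(
    task_type="SEQ_2_SEQ_LM",
    num_virtual_tokens=50,
    num_tasks=1,
    prompt_tuning_init=MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS,
    prompt_tuning_init_state_dict_path="checkpoints/source_prompt.pt",  # hypothetical path
)
```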
+ + +class MultitaskPromptEmbedding(PromptEmbedding): + def __init__(self, config: MultitaskPromptTuningConfig, word_embeddings): + super().__init__(config, word_embeddings) + + self.num_tasks = config.num_tasks + self.num_ranks = config.num_ranks + self.num_virtual_tokens = config.num_virtual_tokens + + self.num_transformer_submodules = config.num_transformer_submodules + if self.num_transformer_submodules is None: + self.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 + + self.token_dim = config.token_dim + + total_virtual_tokens = self.num_virtual_tokens * self.num_transformer_submodules + + self.prefix_task_cols = torch.nn.Parameter( + torch.normal( + mean=0, + std=0.02, + size=(self.num_tasks, total_virtual_tokens, self.num_ranks), + ) + ) + self.prefix_task_rows = torch.nn.Parameter( + torch.normal( + mean=0, + std=0.02, + size=(self.num_tasks, self.num_ranks, self.token_dim), + ) + ) + + if config.prompt_tuning_init in [ + MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, + MultitaskPromptTuningInit.EXACT_SOURCE_TASK, + MultitaskPromptTuningInit.ONLY_SOURCE_SHARED, + ]: + if config.prompt_tuning_init_state_dict_path is None: + raise ValueError( + f"prompt_tuning_init_state_dict_path needs to be specified with {config.prompt_tuning_init} " + "init method" + ) + + # TODO: There should be an option for safetensors + state_dict: dict = torch.load( + config.prompt_tuning_init_state_dict_path, + map_location=word_embeddings.weight.device, + ) + + if config.prompt_tuning_init in [ + MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS, + MultitaskPromptTuningInit.EXACT_SOURCE_TASK, + ]: + prefix_task_cols_: torch.Tensor = state_dict["prefix_task_cols"] + prefix_task_rows_: torch.Tensor = state_dict["prefix_task_rows"] + + if config.prompt_tuning_init == MultitaskPromptTuningInit.AVERAGE_SOURCE_TASKS: + prefix_task_cols_ = prefix_task_cols_.mean(0, keepdim=True) + prefix_task_rows_ = prefix_task_rows_.mean(0, keepdim=True) + elif config.prompt_tuning_init == MultitaskPromptTuningInit.EXACT_SOURCE_TASK: + prefix_task_cols_ = prefix_task_cols_[config.prompt_tuning_init_task, ...].unsqueeze(0) + prefix_task_rows_ = prefix_task_rows_[config.prompt_tuning_init_task, ...].unsqueeze(0) + + state_dict = { + "embedding.weight": state_dict["prompt_embeddings"], + "prefix_task_cols": prefix_task_cols_, + "prefix_task_rows": prefix_task_rows_, + } + + self.load_state_dict(state_dict, strict=True) + elif config.prompt_tuning_init == MultitaskPromptTuningInit.ONLY_SOURCE_SHARED: + state_dict = { + "embedding.weight": state_dict["prompt_embeddings"], + } + + self.load_state_dict(state_dict, strict=False) + + def forward(self, indices, task_ids): + if task_ids is None: + raise ValueError("task_ids cannot be None") + + prompt_embeddings = self.embedding(indices) + + task_cols = torch.index_select(self.prefix_task_cols, 0, task_ids) + task_rows = torch.index_select(self.prefix_task_rows, 0, task_ids) + task_prompts = torch.matmul(task_cols, task_rows) + + prompt_embeddings *= task_prompts + + return prompt_embeddings diff --git a/MoRA/peft_mora/tuners/oft/__init__.py b/MoRA/peft_mora/tuners/oft/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52ac7131e24bd5cf39bf97ab6336ed1f1d46e152 --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
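The `forward` of `MultitaskPromptEmbedding` above composes the selected per-task factors into a rank-`num_ranks` matrix and rescales the shared prompt elementwise. A shape-only sketch of that computation, with illustrative sizes:

``` python
import torch

batch, tokens, rank, dim = 2, 50, 1, 768
shared = torch.randn(batch, tokens, dim)  # shared prompt embeddings looked up from `embedding`
cols = torch.randn(batch, tokens, rank)   # prefix_task_cols rows selected by task_ids
rows = torch.randn(batch, rank, dim)      # prefix_task_rows rows selected by task_ids

task_scale = torch.matmul(cols, rows)     # (batch, tokens, dim): a low-rank per-task matrix
task_prompt = shared * task_scale         # elementwise rescaling, as in the forward above
assert task_prompt.shape == (batch, tokens, dim)
```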
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import OFTConfig +from .layer import Conv2d, Linear, OFTLayer +from .model import OFTModel + + +__all__ = ["OFTConfig", "OFTModel", "Conv2d", "Linear", "OFTLayer"] diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f9908cd15f1115e8f6d388f6583e0dd421e7a4b9 Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3072f6cd987abe6bcfd9190552e2a74067daede9 Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d8864ed7cdb39cd89a6140196bc359a03775f8ef Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/oft/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0348b67b0f06e2d7257140ca2fab33c4333490ae Binary files /dev/null and b/MoRA/peft_mora/tuners/oft/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/oft/config.py b/MoRA/peft_mora/tuners/oft/config.py new file mode 100644 index 0000000000000000000000000000000000000000..4a6ed2ba8f04a57b878d17ee482e454d23617b2d --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/config.py @@ -0,0 +1,119 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Optional, Union + +from peft_mora.tuners.lycoris_utils import LycorisConfig +from peft_mora.utils import PeftType + + +@dataclass +class OFTConfig(LycorisConfig): + """ + This is the configuration class to store the configuration of a [`OFTModel`]. + + Args: + r (`int`): OFT rank. + module_dropout (`int`): The dropout probability for disabling OFT modules during training. + target_modules (`Optional[Union[List[str], str]]`): + The names of the modules to apply the adapter to. If this is specified, only the modules with the specified + names will be replaced. When passing a string, a regex match will be performed. 
When passing a list of + strings, either an exact match will be performed or it is checked if the name of the module ends with any + of the passed strings. If this is specified as 'all-linear', then all linear modules are chosen, excluding + the output layer. If this is not specified, modules will be chosen according to the model architecture. If + the architecture is not known, an error will be raised -- in this case, you should specify the target + modules manually. + init_weights (`bool`): + Whether to perform initialization of OFT weights. + layers_to_transform (`Union[List[int], int]`): + The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices + that are specified in this list. If a single integer is passed, it will apply the transformations on the + layer at this index. + layers_pattern (`str`): + The layer pattern name, used only if `layers_to_transform` is different from `None`. + rank_pattern (`dict`): + The mapping from layer names or regexp expression to ranks which are different from the default rank + specified by `r`. + modules_to_save (`List[str]`): + List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. + coft (`bool`): + Whether to use the constrained variant of OFT or not, off by default. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): + Whether to share the OFT parameters between blocks or not. This is `False` by default. + """ + + r: int = field(default=8, metadata={"help": "OFT rank"}) + module_dropout: float = field( + default=0.0, metadata={"help": "The dropout probability for disabling OFT modules during training"} + ) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with OFT." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the OFT layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + layers_to_transform: Optional[Union[List[int], int]] = field( + default=None, + metadata={ + "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index." + }, + ) + layers_pattern: Optional[str] = field( + default=None, + metadata={ + "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from OFT layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." 
+ }, + ) + coft: bool = field( + default=False, + metadata={"help": "Whether to use the constrained variant of OFT or not."}, + ) + eps: float = field( + default=6e-5, + metadata={ + "help": "The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True." + }, + ) + block_share: bool = field( + default=False, + metadata={"help": "Whether to share the OFT parameters between blocks or not."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.OFT + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/MoRA/peft_mora/tuners/oft/layer.py b/MoRA/peft_mora/tuners/oft/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed3df15bd94eaadf0518497a1e1622b7aec86b5 --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/layer.py @@ -0,0 +1,388 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from typing import Any, List, Optional, Set, Tuple + +import torch +import torch.nn as nn + +from peft_mora.tuners.lycoris_utils import LycorisLayer, check_adapters_to_merge + + +class OFTLayer(nn.Module, LycorisLayer): + # All names of layers that may contain adapter weights + adapter_layer_names = ("oft_r",) + # other_param_names is defined on parent class + + def __init__(self, base_layer: nn.Module): + super().__init__() + LycorisLayer.__init__(self, base_layer) + + # OFT info + self.oft_r = nn.ParameterDict({}) + self.coft = {} + self.eps = {} + self.block_share = {} + + @property + def _available_adapters(self) -> Set[str]: + return {*self.oft_r} + + def create_adapter_parameters(self, adapter_name: str, r: int, shape: Tuple[int, ...], block_share: bool): + if block_share: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(1, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + else: + self.oft_r[adapter_name] = nn.Parameter(torch.empty(r, math.ceil(shape[0] / r), math.ceil(shape[0] / r))) + + def reset_adapter_parameters(self, adapter_name: str): + nn.init.zeros_(self.oft_r[adapter_name]) + + def reset_adapter_parameters_random(self, adapter_name: str): + nn.init.kaiming_uniform_(self.oft_r[adapter_name], a=math.sqrt(5)) + + def update_layer( + self, + adapter_name: str, + r: int, + module_dropout: float, + init_weights: bool, + coft: bool = False, + eps: float = 6e-5, + block_share: bool = False, + **kwargs, + ) -> None: + """Internal function to create oft adapter + + Args: + adapter_name (`str`): Name for the adapter to add. + r (`int`): Rank for the added adapter. + module_dropout (`float`): The dropout probability for disabling adapter during training. + init_weights (`bool`): Whether to initialize weights. + coft (`bool`): Whether to use the constrained variant of OFT or not. + eps (`float`): + The control strength of COFT. The freedom of rotation. Only has an effect if `coft` is set to True. + block_share (`bool`): Whether to share the OFT parameters between blocks or not. 
+ """ + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + self.module_dropout[adapter_name] = module_dropout + self.coft[adapter_name] = coft + self.block_share[adapter_name] = block_share + + # Determine shape of OFT weights + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + shape = tuple(base_layer.weight.shape) + elif isinstance(base_layer, nn.Conv2d): + shape = ( + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ) + else: + raise TypeError(f"OFT is not implemented for base layers of type {type(base_layer).__name__}") + + self.eps[adapter_name] = eps * math.ceil(shape[0] / r) * math.ceil(shape[0] / r) + + # Create weights with provided shape + self.create_adapter_parameters(adapter_name, r, shape, block_share) + + # Initialize weights + if init_weights: + self.reset_adapter_parameters(adapter_name) + else: + self.reset_adapter_parameters_random(adapter_name) + + # Move new weights to device + weight = getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def unscale_layer(self, scale=None) -> None: + # scale is not used + pass + + def merge(self, safe_merge: bool = False, adapter_names: Optional[List[str]] = None) -> None: + """ + Merge the active adapter weights into the base weights + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + """ + adapter_names = check_adapters_to_merge(self, adapter_names) + if not adapter_names: + # no adapter to merge + return + + for active_adapter in adapter_names: + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + + orig_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = orig_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + orig_weights = torch.transpose(orig_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if orig_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: orig_weights.shape[1], : orig_weights.shape[1]] + new_weights = torch.mm(orig_weights, delta_weight) + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = torch.transpose(new_weights, 0, 1) + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + + if safe_merge and not torch.isfinite(new_weights).all(): + raise ValueError( + f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" + ) + + base_layer.weight.data = new_weights + self.merged_adapters.append(active_adapter) + + def unmerge(self) -> None: + """ + This method unmerges all merged adapter layers from the base weights. + """ + if not self.merged: + warnings.warn("Already unmerged. Nothing to do.") + return + while len(self.merged_adapters) > 0: + active_adapter = self.merged_adapters.pop() + if active_adapter in self._available_adapters: + base_layer = self.get_base_layer() + new_weights = base_layer.weight.data + if isinstance(base_layer, nn.Linear): + new_weights = torch.transpose(new_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + new_weights = new_weights.view( + [ + base_layer.out_channels, + base_layer.in_channels * base_layer.kernel_size[0] * base_layer.kernel_size[1], + ] + ) + new_weights = torch.transpose(new_weights, 0, 1) + delta_weight = self.get_delta_weight(active_adapter) + if new_weights.shape[1] != delta_weight.shape[1]: + # when in channels is not divisible by r + delta_weight = delta_weight[: new_weights.shape[1], : new_weights.shape[1]] + delta_inv = torch.inverse(delta_weight) + orig_weights = torch.mm(new_weights, delta_inv) + + if isinstance(base_layer, nn.Linear): + orig_weights = torch.transpose(orig_weights, 0, 1) + elif isinstance(base_layer, nn.Conv2d): + orig_weights = torch.transpose(orig_weights, 0, 1) + orig_weights = orig_weights.reshape( + [ + base_layer.out_channels, + base_layer.in_channels, + base_layer.kernel_size[0], + base_layer.kernel_size[1], + ] + ) + base_layer.weight.data = orig_weights + + def get_delta_weight(self, adapter_name: str) -> torch.Tensor: + rank = self.r[adapter_name] + coft = self.coft[adapter_name] + eps = self.eps[adapter_name] + opt_r = self.oft_r[adapter_name] + + if coft: + with torch.no_grad(): + opt_r.copy_(self._project_batch(opt_r, eps=eps)) + + orth_rotate = self._cayley_batch(opt_r) + weight = self._block_diagonal(orth_rotate, rank) + + return weight + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L144 + def _cayley_batch(self, data: torch.Tensor) -> torch.Tensor: + b, r, c = data.shape + # Ensure the input matrix is skew-symmetric + skew = 0.5 * (data - data.transpose(1, 2)) + I = torch.eye(r, device=data.device).unsqueeze(0).expand(b, r, c) # noqa: E741 + + # Perform the Cayley parametrization + Q = torch.bmm(I - skew, torch.inverse(I + skew)) + + return Q + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L155 + def _block_diagonal(self, oft_r: torch.Tensor, rank: int) -> torch.Tensor: + if oft_r.shape[0] == 1: + # block share + blocks = [oft_r[0, ...] for i in range(rank)] + else: + blocks = [oft_r[i, ...] 
for i in range(rank)] + + # Use torch.block_diag to create the block diagonal matrix + A = torch.block_diag(*blocks) + + return A + + # Copied from https://github.com/Zeju1997/oft/blob/84cebb965df69781e3d9c3c875f5980b421eaf24/oft-control/oft.py#L52 + def _project_batch(self, oft_r, eps=1e-5): + # scaling factor for each of the smaller block matrix + eps = eps * 1 / torch.sqrt(torch.tensor(oft_r.shape[0])) + I = ( # noqa: E741 + torch.zeros((oft_r.size(1), oft_r.size(1)), device=oft_r.device, dtype=oft_r.dtype) + .unsqueeze(0) + .expand_as(oft_r) + ) + diff = oft_r - I + norm_diff = torch.norm(oft_r - I, dim=(1, 2), keepdim=True) + mask = (norm_diff <= eps).bool() + out = torch.where(mask, oft_r, I + eps * (diff / norm_diff)) + return out + + def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: + previous_dtype = x.dtype + + if self.disable_adapters: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + if len(result.shape) == 4: + result = result.permute(0, 2, 3, 1) + + base_layer = self.get_base_layer() + base_bias = base_layer.bias + if base_bias is not None: + # Bias should be added after OFT forward + result = result - base_bias.data + + # Execute all the adapters + for active_adapter in self.active_adapters: + if active_adapter not in self._available_adapters: + continue + + module_dropout = self.module_dropout[active_adapter] + + # Modify current execution weights + if (not self.training) or (self.training and torch.rand(1) > module_dropout): + result = self._get_delta_activations(active_adapter, result, *args, **kwargs) + + if base_bias is not None: + result = result + base_bias.data + if len(result.shape) == 4: + result = result.permute(0, 3, 1, 2) + + result = result.to(previous_dtype) + return result + + +class Linear(OFTLayer): + """OFT implemented in Linear layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." 
+ rep + + +class Conv2d(OFTLayer): + """OFT implemented in Conv2d layer""" + + def __init__( + self, + base_layer: nn.Module, + adapter_name: str = "default", + r: int = 0, + module_dropout: float = 0.0, + init_weights: bool = True, + **kwargs, + ): + super().__init__(base_layer) + + # Create adapter and set it active + self._active_adapter = adapter_name + self.update_layer(adapter_name, r, module_dropout, init_weights, **kwargs) + + def _get_delta_activations( + self, adapter_name: str, input: torch.Tensor, *args: Any, **kwargs: Any + ) -> torch.Tensor: + delta_weight = self.get_delta_weight(adapter_name) + + base_layer = self.get_base_layer() + base_weight = base_layer.weight.data + delta_weight = delta_weight[: base_weight.shape[0], : base_weight.shape[0]] + + # don't add bias here, because the bias will be added after OFT forward + return torch.matmul(input, delta_weight) + + def __repr__(self) -> str: + rep = super().__repr__() + return "oft." + rep diff --git a/MoRA/peft_mora/tuners/oft/model.py b/MoRA/peft_mora/tuners/oft/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c27dd7a73a0d05202763689fa1e24a4e37f42047 --- /dev/null +++ b/MoRA/peft_mora/tuners/oft/model.py @@ -0,0 +1,106 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict, Type, Union + +import torch +from torch import nn + +from peft_mora.tuners.lycoris_utils import LycorisConfig, LycorisTuner + +from .layer import Conv2d, Linear, OFTLayer + + +class OFTModel(LycorisTuner): + """ + Creates Orthogonal Finetuning model from a pretrained model. The method is described in + https://arxiv.org/abs/2306.07280 + + Args: + model (`torch.nn.Module`): The model to which the adapter tuner layers will be attached. + config ([`OFTConfig`]): The configuration of the OFT model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + + Returns: + `torch.nn.Module`: The OFT model. + + Example: + ```py + >>> from diffusers import StableDiffusionPipeline + >>> from peft import OFTModel, OFTConfig + + >>> config_te = OFTConfig( + ... r=8, + ... target_modules=["k_proj", "q_proj", "v_proj", "out_proj", "fc1", "fc2"], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + >>> config_unet = OFTConfig( + ... r=8, + ... target_modules=[ + ... "proj_in", + ... "proj_out", + ... "to_k", + ... "to_q", + ... "to_v", + ... "to_out.0", + ... "ff.net.0.proj", + ... "ff.net.2", + ... ], + ... module_dropout=0.0, + ... init_weights=True, + ... ) + + >>> model = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> model.text_encoder = OFTModel(model.text_encoder, config_te, "default") + >>> model.unet = OFTModel(model.unet, config_unet, "default") + ``` + + **Attributes**: + - **model** ([`~torch.nn.Module`]) -- The model to be adapted. + - **peft_config** ([`OFTConfig`]): The configuration of the OFT model. 
+ """ + + prefix: str = "oft_" + layers_mapping: Dict[Type[torch.nn.Module], Type[OFTLayer]] = { + torch.nn.Conv2d: Conv2d, + torch.nn.Linear: Linear, + } + + def _create_and_replace( + self, + config: LycorisConfig, + adapter_name: str, + target: Union[OFTLayer, nn.Module], + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + """ + A private method to create and replace the target module with the adapter module. + """ + + # Regexp matching - Find key which matches current target_name in patterns provided + pattern_keys = list(config.rank_pattern.keys()) + target_name_key = next(filter(lambda key: re.match(rf"(.*\.)?{key}$", current_key), pattern_keys), target_name) + + kwargs = config.to_dict() + kwargs["r"] = config.rank_pattern.get(target_name_key, config.r) + + if isinstance(target, OFTLayer): + target.update_layer(adapter_name, **kwargs) + else: + new_module = self._create_new_module(config, adapter_name, target, **kwargs) + self._replace_module(parent, target_name, new_module, target) diff --git a/MoRA/peft_mora/tuners/p_tuning/__init__.py b/MoRA/peft_mora/tuners/p_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7dd3a6ba3e4442354302c5bfe3da75f1d6f69d02 --- /dev/null +++ b/MoRA/peft_mora/tuners/p_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import PromptEncoderConfig, PromptEncoderReparameterizationType +from .model import PromptEncoder + + +__all__ = ["PromptEncoder", "PromptEncoderConfig", "PromptEncoderReparameterizationType"] diff --git a/MoRA/peft_mora/tuners/p_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/p_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..76895b4f965d5be0a8c27fcf19250d1bc5dd5a3e Binary files /dev/null and b/MoRA/peft_mora/tuners/p_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/p_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/p_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70fa7ea53e8aa01cc82ebda6cc5e40fd599385d2 Binary files /dev/null and b/MoRA/peft_mora/tuners/p_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/p_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/p_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b7d0f1ff1db4ee9218a1f39d510fbfd9524b0e7 Binary files /dev/null and b/MoRA/peft_mora/tuners/p_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/p_tuning/config.py b/MoRA/peft_mora/tuners/p_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..d6288c1b5e48f747489c43fb1e4067a625facdba --- /dev/null +++ b/MoRA/peft_mora/tuners/p_tuning/config.py @@ -0,0 +1,59 @@ +# Copyright 2023-present the HuggingFace Inc. team. 
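For reference, the orthogonal rotation that `get_delta_weight` builds in the OFT layer code above comes from a per-block Cayley transform (`_cayley_batch`) followed by block-diagonal assembly (`_block_diagonal`). A self-contained numeric sketch; block count and size are illustrative:

``` python
import torch

r, b = 4, 3                    # r diagonal blocks of size b x b (illustrative)
oft_r = torch.randn(r, b, b)   # unconstrained per-block parameters

skew = 0.5 * (oft_r - oft_r.transpose(1, 2))      # skew-symmetric part, as in _cayley_batch
I = torch.eye(b).unsqueeze(0).expand(r, b, b)
Q = torch.bmm(I - skew, torch.inverse(I + skew))  # Cayley transform: each block is orthogonal
R = torch.block_diag(*Q)                          # (r*b, r*b) rotation, as in _block_diagonal

# Orthogonal up to float error: R.T @ R is (approximately) the identity.
assert torch.allclose(R.T @ R, torch.eye(r * b), atol=1e-4)
```

As the merge code above shows, this block-diagonal matrix is multiplied into the (transposed) base weight, so the OFT update acts as a rotation of the output dimension rather than an additive delta.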
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +from dataclasses import dataclass, field +from typing import Union + +from peft_mora.config import PromptLearningConfig +from peft_mora.utils import PeftType + + +class PromptEncoderReparameterizationType(str, enum.Enum): + MLP = "MLP" + LSTM = "LSTM" + + +@dataclass +class PromptEncoderConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PromptEncoder`]. + + Args: + encoder_reparameterization_type (Union[[`PromptEncoderReparameterizationType`], `str`]): + The type of reparameterization to use. + encoder_hidden_size (`int`): The hidden size of the prompt encoder. + encoder_num_layers (`int`): The number of layers of the prompt encoder. + encoder_dropout (`float`): The dropout probability of the prompt encoder. + """ + + encoder_reparameterization_type: Union[str, PromptEncoderReparameterizationType] = field( + default=PromptEncoderReparameterizationType.MLP, + metadata={"help": "How to reparameterize the prompt encoder"}, + ) + encoder_hidden_size: int = field( + default=None, + metadata={"help": "The hidden size of the prompt encoder"}, + ) + encoder_num_layers: int = field( + default=2, + metadata={"help": "The number of layers of the prompt encoder"}, + ) + encoder_dropout: float = field( + default=0.0, + metadata={"help": "The dropout of the prompt encoder"}, + ) + + def __post_init__(self): + self.peft_type = PeftType.P_TUNING diff --git a/MoRA/peft_mora/tuners/p_tuning/model.py b/MoRA/peft_mora/tuners/p_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ade2b1128158376c134441687803b85d444cfb96 --- /dev/null +++ b/MoRA/peft_mora/tuners/p_tuning/model.py @@ -0,0 +1,130 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Based on https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/nlp/modules/common/prompt_encoder.py +# with some refactor +import warnings + +import torch + +from .config import PromptEncoderConfig, PromptEncoderReparameterizationType + + +class PromptEncoder(torch.nn.Module): + """ + The prompt encoder network that is used to generate the virtual token embeddings for p-tuning. + + Args: + config ([`PromptEncoderConfig`]): The configuration of the prompt encoder. + + Example: + + ```py + >>> from peft import PromptEncoder, PromptEncoderConfig + + >>> config = PromptEncoderConfig( + ... peft_type="P_TUNING", + ... task_type="SEQ_2_SEQ_LM", + ... 
num_virtual_tokens=20, + ... token_dim=768, + ... num_transformer_submodules=1, + ... num_attention_heads=12, + ... num_layers=12, + ... encoder_reparameterization_type="MLP", + ... encoder_hidden_size=768, + ... ) + + >>> prompt_encoder = PromptEncoder(config) + ``` + + **Attributes**: + - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt encoder. + - **mlp_head** (`torch.nn.Sequential`) -- The MLP head of the prompt encoder if `inference_mode=False`. + - **lstm_head** (`torch.nn.LSTM`) -- The LSTM head of the prompt encoder if `inference_mode=False` and + `encoder_reparameterization_type="LSTM"`. + - **token_dim** (`int`) -- The hidden embedding dimension of the base transformer model. + - **input_size** (`int`) -- The input size of the prompt encoder. + - **output_size** (`int`) -- The output size of the prompt encoder. + - **hidden_size** (`int`) -- The hidden size of the prompt encoder. + - **total_virtual_tokens** (`int`): The total number of virtual tokens of the + prompt encoder. + - **encoder_type** (Union[[`PromptEncoderReparameterizationType`], `str`]): The encoder type of the prompt + encoder. + + + Input shape: (`batch_size`, `total_virtual_tokens`) + + Output shape: (`batch_size`, `total_virtual_tokens`, `token_dim`) + """ + + def __init__(self, config): + super().__init__() + self.token_dim = config.token_dim + self.input_size = self.token_dim + self.output_size = self.token_dim + self.hidden_size = config.encoder_hidden_size + self.total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules + self.encoder_type = config.encoder_reparameterization_type + + # embedding + self.embedding = torch.nn.Embedding(self.total_virtual_tokens, self.token_dim) + if not config.inference_mode: + if self.encoder_type == PromptEncoderReparameterizationType.LSTM: + lstm_dropout = config.encoder_dropout + num_layers = config.encoder_num_layers + # LSTM + self.lstm_head = torch.nn.LSTM( + input_size=self.input_size, + hidden_size=self.hidden_size, + num_layers=num_layers, + dropout=lstm_dropout, + bidirectional=True, + batch_first=True, + ) + + self.mlp_head = torch.nn.Sequential( + torch.nn.Linear(self.hidden_size * 2, self.hidden_size * 2), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size * 2, self.output_size), + ) + + elif self.encoder_type == PromptEncoderReparameterizationType.MLP: + encoder_num_layers_default = PromptEncoderConfig.encoder_num_layers + if config.encoder_num_layers != encoder_num_layers_default: + warnings.warn( + f"for {self.encoder_type.value}, the argument `encoder_num_layers` is ignored. " + f"Exactly {encoder_num_layers_default} MLP layers are used." + ) + layers = [ + torch.nn.Linear(self.input_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.hidden_size), + torch.nn.ReLU(), + torch.nn.Linear(self.hidden_size, self.output_size), + ] + self.mlp_head = torch.nn.Sequential(*layers) + + else: + raise ValueError("Prompt encoder type not recognized. Please use one of MLP (recommended) or LSTM.") + + def forward(self, indices): + input_embeds = self.embedding(indices) + if self.encoder_type == PromptEncoderReparameterizationType.LSTM: + output_embeds = self.mlp_head(self.lstm_head(input_embeds)[0]) + elif self.encoder_type == PromptEncoderReparameterizationType.MLP: + output_embeds = self.mlp_head(input_embeds) + else: + raise ValueError("Prompt encoder type not recognized. 
Please use one of MLP (recommended) or LSTM.") + + return output_embeds diff --git a/MoRA/peft_mora/tuners/poly/__init__.py b/MoRA/peft_mora/tuners/poly/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f368695edbd7fb7bb3c68d9e918bd16752b873 --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .config import PolyConfig +from .layer import Linear, PolyLayer +from .model import PolyModel + + +__all__ = ["Linear", "PolyConfig", "PolyLayer", "PolyModel"] diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e4e9c210ea53057952e5cae8141e970bb075e1a Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..36a3b7469e7097f2cc60c1e2824934cbfa8f7511 Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/layer.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/layer.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d09270ae7aa7f995cd36a937177701dcd5074ffa Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/layer.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f89a02e5d2513378911820781a29b0d1bb590b91 Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/__pycache__/router.cpython-312.pyc b/MoRA/peft_mora/tuners/poly/__pycache__/router.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41f0dcb83bae629efb3ffed07de2fb90307b652b Binary files /dev/null and b/MoRA/peft_mora/tuners/poly/__pycache__/router.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/poly/config.py b/MoRA/peft_mora/tuners/poly/config.py new file mode 100644 index 0000000000000000000000000000000000000000..0cdac159f790fce6d6dd8e373e3f277ea01882de --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/config.py @@ -0,0 +1,89 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import List, Literal, Optional, Union + +from peft_mora.config import PeftConfig +from peft_mora.utils import PeftType + + +@dataclass +class PolyConfig(PeftConfig): + """ + This is the configuration class to store the configuration of a [`PolyModel`]. + - [Polytropon (Poly)](https://arxiv.org/abs/2202.13914) + - [Multi-Head Routing (MHR)](https://arxiv.org/abs/2211.03831) + + Args: + r (`int`): Attention dimension of each Lora in Poly. + target_modules (`Union[List[str],str]`): The names of the modules to apply Poly to. + modules_to_save (`List[str]`): List of modules apart from Poly layers to be set as trainable + and saved in the final checkpoint. + init_weights (bool): Whether to perform initialization of Poly weights. + poly_type (`Literal["poly"]`): The variant of the Poly module to use. Currently, only "poly" + is supported. + n_tasks (`int`): The number of tasks in a multitasking scenario. + n_skills (`int`): The number of skills (LoRA) in each Poly layer. + n_splits (`int`): The number of splits within each LoRA of a Poly layer. A value greater + than 1 indicates the use of Multi-Head Routing (MHR). + """ + + r: int = field(default=8, metadata={"help": "Lora attention dimension"}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with Poly." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + modules_to_save: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of modules apart from Poly layers to be set as trainable and saved in the final checkpoint. " + "For example, in Sequence Classification or Token Classification tasks, " + "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." + }, + ) + init_weights: bool = field( + default=True, + metadata={ + "help": ( + "Whether to initialize the weights of the Poly layers with their default initialization. Don't change " + "this setting, except if you know exactly what you're doing." + ), + }, + ) + poly_type: Literal["poly"] = field( + default="poly", + metadata={"help": 'Type of Poly modules to be used. 
Currently only "poly" is supported.'}, + ) + n_tasks: int = field( + default=1, + metadata={"help": "Number of tasks in multitasking scenario."}, + ) + n_skills: int = field( + default=4, + metadata={"help": "Number of skills (LoRA) in each Poly layer."}, + ) + n_splits: int = field( + default=1, + metadata={"help": "Number of splits within each LoRA of a Poly layer."}, + ) + + def __post_init__(self): + self.peft_type = PeftType.POLY + self.target_modules = ( + set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules + ) diff --git a/MoRA/peft_mora/tuners/poly/layer.py b/MoRA/peft_mora/tuners/poly/layer.py new file mode 100644 index 0000000000000000000000000000000000000000..0455c8730f6889be92da7f8b74221d2ca5b8a73e --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/layer.py @@ -0,0 +1,171 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Any + +import torch +import torch.nn as nn + +from peft_mora.tuners.tuners_utils import BaseTunerLayer + +from .config import PolyConfig +from .router import get_router + + +class PolyLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("poly_lora_A", "poly_lora_B", "poly_router") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "n_tasks", "n_skills", "n_splits") + + def __init__(self, base_layer: nn.Module, **kwargs): + self.base_layer = base_layer + self.r = {} + self.n_tasks = {} + self.n_skills = {} + self.n_splits = {} + self.poly_type = {} + self.poly_router = nn.ModuleDict() + self.poly_lora_A = nn.ParameterDict() + self.poly_lora_B = nn.ParameterDict() + self.kwargs = kwargs + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + raise ValueError(f"Unsupported layer type {type(base_layer)}") + + self.in_features = in_features + self.out_features = out_features + + def update_layer(self, adapter_name, poly_config): + if poly_config.r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {poly_config.r}") + + self.r[adapter_name] = poly_config.r + self.n_tasks[adapter_name] = poly_config.n_tasks + self.n_skills[adapter_name] = poly_config.n_skills + self.n_splits[adapter_name] = poly_config.n_splits + self.poly_type[adapter_name] = poly_config.poly_type + + self.poly_lora_A[adapter_name] = nn.Parameter( + torch.empty( + poly_config.n_splits, + poly_config.n_skills, + self.in_features // poly_config.n_splits, + poly_config.r, + ) + ) + self.poly_lora_B[adapter_name] = nn.Parameter( + torch.empty( + poly_config.n_splits, + poly_config.n_skills, + poly_config.r, + self.out_features // poly_config.n_splits, + ) + ) + self.poly_router[adapter_name] = get_router(poly_config) + + self.reset_poly_parameters(adapter_name, init_weights=poly_config.init_weights) + + weight = 
getattr(self.get_base_layer(), "weight", None) + if weight is not None: + # the layer is already completely initialized, this is an update + if weight.dtype.is_floating_point or weight.dtype.is_complex: + self.to(weight.device, dtype=weight.dtype) + else: + self.to(weight.device) + self.set_adapter(self.active_adapters) + + def reset_poly_parameters(self, adapter_name, init_weights): + if adapter_name in self.poly_lora_A.keys(): + # initialize A the same way as the default for nn.Linear + # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L269 + n_splits, n_skills, d, r = self.poly_lora_A[adapter_name].shape + for skill in range(n_skills): + for split in range(n_splits): + param = torch.empty((r, d)) + torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5)) + self.poly_lora_A[adapter_name].data[split, skill, :, :] = param.T + + if init_weights: + # initialize B to zero + torch.nn.init.zeros_(self.poly_lora_B[adapter_name]) + else: + # initialize B the same way as the default for nn.Linear + n_splits, n_skills, r, d = self.poly_lora_B[adapter_name].shape + for skill in range(n_skills): + for split in range(n_splits): + param = torch.empty((d, r)) + torch.nn.init.kaiming_uniform_(param, a=math.sqrt(5)) + self.poly_lora_B[adapter_name].data[split, skill, :, :] = param.T + + # initialized router + self.poly_router[adapter_name].reset() + + +class Linear(nn.Module, PolyLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + poly_config: PolyConfig, + **kwargs, + ) -> None: + super().__init__() + PolyLayer.__init__(self, base_layer, **kwargs) + + self._active_adapter = adapter_name + self.update_layer(adapter_name, poly_config) + + def forward(self, x: torch.Tensor, *args: Any, task_ids: torch.Tensor = None, **kwargs: Any) -> torch.Tensor: + previous_dtype = x.dtype + if self.disable_adapters: + result = self.base_layer(x, *args, **kwargs) + else: + result = self.base_layer(x, *args, **kwargs) + for active_adapter in self.active_adapters: + if active_adapter not in self.poly_lora_A.keys(): + continue + + r = self.r[active_adapter] + poly_router = self.poly_router[active_adapter] + poly_lora_A = self.poly_lora_A[active_adapter] + poly_lora_B = self.poly_lora_B[active_adapter] + + # Combine the output of LoRAs + # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L293 + mixing_weights = poly_router(task_ids=task_ids, input_ids=x) + bs, n_splits, n_skills = mixing_weights.size() + + # A is n_splits, n_skills, D // n_splits, rank + # we want bs, n_splits, D // n_splits, rank + A = torch.einsum("bqs,qsdr->bqdr", (mixing_weights, poly_lora_A)) + B = torch.einsum("bqs,qsrd->bqrd", (mixing_weights, poly_lora_B)) + + A = A.reshape(bs, self.in_features, r) + B = B.transpose(1, 2).reshape(bs, r, self.out_features) + + x = x.to(A.dtype) + result += x.bmm(A).bmm(B) / r + + result = result.to(previous_dtype) + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "poly." + rep diff --git a/MoRA/peft_mora/tuners/poly/model.py b/MoRA/peft_mora/tuners/poly/model.py new file mode 100644 index 0000000000000000000000000000000000000000..0a00e98a44623939cbb36a2e4269ceef79a622b2 --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/model.py @@ -0,0 +1,187 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from contextlib import contextmanager +from dataclasses import asdict +from enum import Enum +from typing import Any + +import torch +from torch import nn + +from peft_mora.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists +from peft_mora.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, +) + +from .config import PolyConfig +from .layer import Linear, PolyLayer + + +class PolyModel(BaseTuner): + prefix: str = "poly_" + + def __init__(self, model, config, adapter_name) -> None: + super().__init__(model, config, adapter_name) + + @staticmethod + def _check_target_module_exists(poly_config, key): + return check_target_module_exists(poly_config, key) + + def _create_and_replace( + self, + poly_config: PolyConfig, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + **optional_kwargs: Any, + ): + if isinstance(target, PolyLayer): + target.update_layer(adapter_name, poly_config) + else: + new_module = self._create_new_module( + poly_config, + adapter_name, + target, + ) + if adapter_name != self.active_adapter: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + # dispatch to correct device + for name, module in new_module.named_modules(): + if (self.prefix in name) or ("ranknum" in name): + weight = child.qweight if hasattr(child, "qweight") else child.weight + module.to(weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + @staticmethod + def _create_new_module(poly_config, adapter_name, target, **kwargs): + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + if isinstance(target_base_layer, torch.nn.Linear): + return Linear(target, adapter_name, poly_config, **kwargs) + else: + raise ValueError( + f"Target module {target} is not supported. Currently, only the following modules are supported: " + "`torch.nn.Linear`." 
+ ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled=True): + for module in self.model.modules(): + if isinstance(module, (PolyLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self): + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self): + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name): + for module in self.model.modules(): + if isinstance(module, PolyLayer): + module.set_adapter(adapter_name) + + def _prepare_adapter_config(self, peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _register_pre_hooks(self, task_ids): + """Helper method to register pre hooks.""" + if task_ids is None: + return [] + + def pre_hook(_, args, kwargs): + kwargs["task_ids"] = task_ids + return args, kwargs + + handles = [] + + for module in self.model.modules(): + if isinstance(module, Linear): + handle = module.register_forward_pre_hook(pre_hook, with_kwargs=True) + handles.append(handle) + + return handles + + @contextmanager + def _manage_pre_hooks(self, task_ids): + """Context manager to handle the lifecycle of pre hooks.""" + handles = self._register_pre_hooks(task_ids) + try: + yield + finally: + for handle in handles: + handle.remove() + + def forward(self, *args, task_ids=None, **kwargs): + with self._manage_pre_hooks(task_ids): + return self.model(*args, **kwargs) + + def generate(self, *args, task_ids=None, **kwargs): + with self._manage_pre_hooks(task_ids): + return self.model.generate(*args, **kwargs) diff --git a/MoRA/peft_mora/tuners/poly/router.py b/MoRA/peft_mora/tuners/poly/router.py new file mode 100644 index 0000000000000000000000000000000000000000..0249398a9fc36d53bc0b4f022a8410514688a9f1 --- /dev/null +++ b/MoRA/peft_mora/tuners/poly/router.py @@ -0,0 +1,83 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
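+
+# Usage sketch (illustrative only, kept as a comment; it is not executed on import). It shows how the
+# router defined below turns a batch of task ids into normalized mixing weights over the skills
+# (LoRA modules) of each split; the concrete numbers are arbitrary placeholders.
+#
+#     import torch
+#     from peft_mora.tuners.poly import PolyConfig
+#     from peft_mora.tuners.poly.router import get_router
+#
+#     config = PolyConfig(n_tasks=4, n_skills=8, n_splits=2)
+#     router = get_router(config)        # -> PolyRouter, since poly_type defaults to "poly"
+#     router.reset()                     # module_logits starts uninitialized; reset() fills it
+#     task_ids = torch.tensor([0, 3])    # a batch of two examples from tasks 0 and 3
+#     weights = router(task_ids=task_ids, input_ids=None)
+#     # weights has shape (2, n_splits, n_skills); each (example, split) row sums to ~1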
+ +from abc import ABC, abstractmethod + +import torch +from torch import nn +from torch.distributions.relaxed_bernoulli import RelaxedBernoulli + +from .config import PolyConfig + + +EPS = 1e-12 + + +def get_router(poly_config: PolyConfig) -> nn.Module: + if poly_config.poly_type == "poly": + return PolyRouter(poly_config) + else: + raise ValueError( + f"Unsupported poly_type: {poly_config.poly_type}. " + "Currently, only the following types are supported: " + "`poly`." + ) + + +class Router(nn.Module, ABC): + @abstractmethod + def reset(self): + ... + + @abstractmethod + def forward(self, task_ids: torch.Tensor, input_ids: torch.Tensor): + ... + + +class PolyRouter(Router): + # It's a simplified implementation of + # https://github.com/microsoft/mttl/blob/ce4ca51dbca73be656feb9b3e5233633e3c5dec7/mttl/models/poly.py#L138 + def __init__(self, poly_config: PolyConfig): + super().__init__() + + self.poly_type = poly_config.poly_type + self.n_tasks = poly_config.n_tasks + self.n_skills = poly_config.n_skills + self.n_splits = poly_config.n_splits + + self.module_logits = nn.Parameter(torch.empty((self.n_tasks, self.n_splits * self.n_skills))) + + def reset(self): + torch.nn.init.uniform_(self.module_logits, -1e-3, 1e-3) + + def forward(self, task_ids: torch.Tensor, input_ids: torch.Tensor): + if task_ids is None: + raise ValueError("task_ids should not be None.") + if task_ids.max().item() >= self.n_tasks: + raise ValueError(f"Only {self.n_tasks} tasks available. Found task id = {task_ids.max().item()}") + + # move task id to input's device + task_ids = task_ids.to(self.module_logits.device) + + module_logits = self.module_logits[task_ids] + module_logits = module_logits.view(-1, self.n_splits, self.n_skills) + + if self.training: + module_logits = RelaxedBernoulli(temperature=1.0, logits=module_logits).rsample() + else: + module_logits = torch.sigmoid(module_logits) + + module_weights = module_logits / (module_logits.sum(dim=-1, keepdim=True) + EPS) + + return module_weights diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__init__.py b/MoRA/peft_mora/tuners/prefix_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..28f4bedbb43bcf2b22146d60e0e1f2fe7b19d9eb --- /dev/null +++ b/MoRA/peft_mora/tuners/prefix_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
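+
+# Usage sketch (illustrative comment only; "t5-base" is a placeholder checkpoint and the imports
+# mirror the `from peft import ...` style used in the docstrings of this package):
+#
+#     from transformers import AutoModelForSeq2SeqLM
+#     from peft import PrefixTuningConfig, get_peft_model
+#
+#     model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+#     config = PrefixTuningConfig(task_type="SEQ_2_SEQ_LM", num_virtual_tokens=20)
+#     model = get_peft_model(model, config)
+#     # only the prefix encoder is trainable; it supplies 20 virtual tokens as per-layer
+#     # key/value prefixes at every forward pass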
+ +from .config import PrefixTuningConfig +from .model import PrefixEncoder + + +__all__ = ["PrefixTuningConfig", "PrefixEncoder"] diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..60cf0f0394986f800330ad85d41614acc5b46346 Binary files /dev/null and b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fc20711f87961d6e34cf655396f247d65364b03 Binary files /dev/null and b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..314c191115a1a8f766e220fc2e924c4922b5eb31 Binary files /dev/null and b/MoRA/peft_mora/tuners/prefix_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prefix_tuning/config.py b/MoRA/peft_mora/tuners/prefix_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..24ea1ebcb5db96177d39c2277c77f1e839df4703 --- /dev/null +++ b/MoRA/peft_mora/tuners/prefix_tuning/config.py @@ -0,0 +1,41 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from peft_mora.config import PromptLearningConfig +from peft_mora.utils import PeftType + + +@dataclass +class PrefixTuningConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PrefixEncoder`]. + + Args: + encoder_hidden_size (`int`): The hidden size of the prompt encoder. + prefix_projection (`bool`): Whether to project the prefix embeddings. + """ + + encoder_hidden_size: int = field( + default=None, + metadata={"help": "The hidden size of the encoder"}, + ) + prefix_projection: bool = field( + default=False, + metadata={"help": "Whether to project the prefix tokens"}, + ) + + def __post_init__(self): + self.peft_type = PeftType.PREFIX_TUNING diff --git a/MoRA/peft_mora/tuners/prefix_tuning/model.py b/MoRA/peft_mora/tuners/prefix_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd51892a3cc074406791f6bc7d1b088d25148e3 --- /dev/null +++ b/MoRA/peft_mora/tuners/prefix_tuning/model.py @@ -0,0 +1,80 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Based on https://github.com/THUDM/P-tuning-v2/blob/main/model/prefix_encoder.py +# with some refactor +import torch + + +class PrefixEncoder(torch.nn.Module): + r""" + The `torch.nn` model to encode the prefix. + + Args: + config ([`PrefixTuningConfig`]): The configuration of the prefix encoder. + + Example: + + ```py + >>> from peft import PrefixEncoder, PrefixTuningConfig + + >>> config = PrefixTuningConfig( + ... peft_type="PREFIX_TUNING", + ... task_type="SEQ_2_SEQ_LM", + ... num_virtual_tokens=20, + ... token_dim=768, + ... num_transformer_submodules=1, + ... num_attention_heads=12, + ... num_layers=12, + ... encoder_hidden_size=768, + ... ) + >>> prefix_encoder = PrefixEncoder(config) + ``` + + **Attributes**: + - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prefix encoder. + - **transform** (`torch.nn.Sequential`) -- The two-layer MLP to transform the prefix embeddings if + `prefix_projection` is `True`. + - **prefix_projection** (`bool`) -- Whether to project the prefix embeddings. + + Input shape: (`batch_size`, `num_virtual_tokens`) + + Output shape: (`batch_size`, `num_virtual_tokens`, `2*layers*hidden`) + """ + + def __init__(self, config): + super().__init__() + self.prefix_projection = config.prefix_projection + token_dim = config.token_dim + num_layers = config.num_layers + encoder_hidden_size = config.encoder_hidden_size + num_virtual_tokens = config.num_virtual_tokens + if self.prefix_projection and not config.inference_mode: + # Use a two-layer MLP to encode the prefix + self.embedding = torch.nn.Embedding(num_virtual_tokens, token_dim) + self.transform = torch.nn.Sequential( + torch.nn.Linear(token_dim, encoder_hidden_size), + torch.nn.Tanh(), + torch.nn.Linear(encoder_hidden_size, num_layers * 2 * token_dim), + ) + else: + self.embedding = torch.nn.Embedding(num_virtual_tokens, num_layers * 2 * token_dim) + + def forward(self, prefix: torch.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.transform(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__init__.py b/MoRA/peft_mora/tuners/prompt_tuning/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..71795b61d819573ff41770e6d49c750e6c51b0ae --- /dev/null +++ b/MoRA/peft_mora/tuners/prompt_tuning/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
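+
+# Usage sketch (illustrative comment only; the checkpoint name and the initialization text are
+# placeholders, and the imports mirror the `from peft import ...` style used in this package's
+# docstrings):
+#
+#     from transformers import AutoModelForCausalLM
+#     from peft import PromptTuningConfig, PromptTuningInit, get_peft_model
+#
+#     model = AutoModelForCausalLM.from_pretrained("gpt2")
+#     config = PromptTuningConfig(
+#         task_type="CAUSAL_LM",
+#         num_virtual_tokens=8,
+#         prompt_tuning_init=PromptTuningInit.TEXT,
+#         prompt_tuning_init_text="Classify the sentiment of this review:",
+#         tokenizer_name_or_path="gpt2",
+#     )
+#     model = get_peft_model(model, config)
+#     # only the 8 virtual-token embeddings are trainable; with TEXT init they start from the
+#     # word embeddings of the tokenized init text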
+ +from .config import PromptTuningConfig, PromptTuningInit +from .model import PromptEmbedding + + +__all__ = ["PromptTuningConfig", "PromptEmbedding", "PromptTuningInit"] diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70ef65eb7be3dc096910e28bd2c39f3919aacc4e Binary files /dev/null and b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/config.cpython-312.pyc b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/config.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..388b3710f0c9b819ddcfbbcf0651026e0e980dfb Binary files /dev/null and b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/config.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/model.cpython-312.pyc b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/model.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fe6c76031de3e9b2706dee5fb2891775bdf901c6 Binary files /dev/null and b/MoRA/peft_mora/tuners/prompt_tuning/__pycache__/model.cpython-312.pyc differ diff --git a/MoRA/peft_mora/tuners/prompt_tuning/config.py b/MoRA/peft_mora/tuners/prompt_tuning/config.py new file mode 100644 index 0000000000000000000000000000000000000000..cf5e2303ac4ecfb9bf2befbc0d21b00a18bea663 --- /dev/null +++ b/MoRA/peft_mora/tuners/prompt_tuning/config.py @@ -0,0 +1,77 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +from dataclasses import dataclass, field +from typing import Optional, Union + +from peft_mora.config import PromptLearningConfig +from peft_mora.utils import PeftType + + +class PromptTuningInit(str, enum.Enum): + TEXT = "TEXT" + RANDOM = "RANDOM" + + +@dataclass +class PromptTuningConfig(PromptLearningConfig): + """ + This is the configuration class to store the configuration of a [`PromptEmbedding`]. + + Args: + prompt_tuning_init (Union[[`PromptTuningInit`], `str`]): The initialization of the prompt embedding. + prompt_tuning_init_text (`str`, *optional*): + The text to initialize the prompt embedding. Only used if `prompt_tuning_init` is `TEXT`. + tokenizer_name_or_path (`str`, *optional*): + The name or path of the tokenizer. Only used if `prompt_tuning_init` is `TEXT`. + tokenizer_kwargs (`dict`, *optional*): + The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if `prompt_tuning_init` is + `TEXT`. + """ + + prompt_tuning_init: Union[PromptTuningInit, str] = field( + default=PromptTuningInit.RANDOM, + metadata={"help": "How to initialize the prompt tuning parameters"}, + ) + prompt_tuning_init_text: Optional[str] = field( + default=None, + metadata={ + "help": "The text to use for prompt tuning initialization. 
Only used if prompt_tuning_init is `TEXT`" + }, + ) + tokenizer_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The tokenizer to use for prompt tuning initialization. Only used if prompt_tuning_init is `TEXT`" + }, + ) + + tokenizer_kwargs: Optional[dict] = field( + default=None, + metadata={ + "help": ( + "The keyword arguments to pass to `AutoTokenizer.from_pretrained`. Only used if prompt_tuning_init is " + "`TEXT`" + ), + }, + ) + + def __post_init__(self): + self.peft_type = PeftType.PROMPT_TUNING + + if self.tokenizer_kwargs and (self.prompt_tuning_init != PromptTuningInit.TEXT): + raise ValueError( + f"tokenizer_kwargs only valid when using prompt_tuning_init='{PromptTuningInit.TEXT.value}'." + ) diff --git a/MoRA/peft_mora/tuners/prompt_tuning/model.py b/MoRA/peft_mora/tuners/prompt_tuning/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a04221c2abfd1fb806df2805a7a28e4e3073a32d --- /dev/null +++ b/MoRA/peft_mora/tuners/prompt_tuning/model.py @@ -0,0 +1,89 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch + +from .config import PromptTuningInit + + +class PromptEmbedding(torch.nn.Module): + """ + The model to encode virtual tokens into prompt embeddings. + + Args: + config ([`PromptTuningConfig`]): The configuration of the prompt embedding. + word_embeddings (`torch.nn.Module`): The word embeddings of the base transformer model. + + **Attributes**: + - **embedding** (`torch.nn.Embedding`) -- The embedding layer of the prompt embedding. + + Example: + + ```py + >>> from peft import PromptEmbedding, PromptTuningConfig + + >>> config = PromptTuningConfig( + ... peft_type="PROMPT_TUNING", + ... task_type="SEQ_2_SEQ_LM", + ... num_virtual_tokens=20, + ... token_dim=768, + ... num_transformer_submodules=1, + ... num_attention_heads=12, + ... num_layers=12, + ... prompt_tuning_init="TEXT", + ... prompt_tuning_init_text="Predict if sentiment of this review is positive, negative or neutral", + ... tokenizer_name_or_path="t5-base", + ... 
) + + >>> # t5_model.shared is the word embeddings of the base model + >>> prompt_embedding = PromptEmbedding(config, t5_model.shared) + ``` + + Input Shape: (`batch_size`, `total_virtual_tokens`) + + Output Shape: (`batch_size`, `total_virtual_tokens`, `token_dim`) + """ + + def __init__(self, config, word_embeddings): + super().__init__() + + total_virtual_tokens = config.num_virtual_tokens * config.num_transformer_submodules + self.embedding = torch.nn.Embedding(total_virtual_tokens, config.token_dim) + if config.prompt_tuning_init == PromptTuningInit.TEXT and not config.inference_mode: + from transformers import AutoTokenizer + + tokenizer_kwargs = config.tokenizer_kwargs or {} + tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name_or_path, **tokenizer_kwargs) + init_text = config.prompt_tuning_init_text + init_token_ids = tokenizer(init_text)["input_ids"] + # Trim or iterate until num_text_tokens matches total_virtual_tokens + num_text_tokens = len(init_token_ids) + if num_text_tokens > total_virtual_tokens: + init_token_ids = init_token_ids[:total_virtual_tokens] + elif num_text_tokens < total_virtual_tokens: + num_reps = math.ceil(total_virtual_tokens / num_text_tokens) + init_token_ids = init_token_ids * num_reps + init_token_ids = init_token_ids[:total_virtual_tokens] + init_token_ids = torch.LongTensor(init_token_ids).to(word_embeddings.weight.device) + + word_embedding_weights = word_embeddings(init_token_ids).detach().clone() + word_embedding_weights = word_embedding_weights.to(torch.float32) + self.embedding.weight = torch.nn.Parameter(word_embedding_weights) + + def forward(self, indices): + # Just get embeddings + prompt_embeddings = self.embedding(indices) + return prompt_embeddings diff --git a/MoRA/peft_mora/tuners/tuners_utils.py b/MoRA/peft_mora/tuners/tuners_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5f0437dbc1699178cc41af91281b5352912701dc --- /dev/null +++ b/MoRA/peft_mora/tuners/tuners_utils.py @@ -0,0 +1,667 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import logging +import re +import warnings +from abc import ABC, abstractmethod +from contextlib import contextmanager +from typing import Any, Optional, Union + +import torch +from accelerate.hooks import AlignDevicesHook +from accelerate.utils import named_module_tensors, offload_state_dict +from torch import nn +from transformers import PreTrainedModel +from transformers.pytorch_utils import Conv1D + +from peft_mora.utils import INCLUDE_LINEAR_LAYERS_SHORTHAND + +from ..config import PeftConfig +from ..utils import ModulesToSaveWrapper, _get_submodules + + +logger = logging.getLogger(__name__) + + +@contextmanager +def onload_layer(layer): + r""" + A utility for modifying a module containing one or more tuners and a base layer, any of which are offloaded to the + CPU or disk. 
Moves a module's sub-modules to the execution device before some action is performed, after that the + base layer state dictionary is re-assigned (if that layer was offloaded to the disk) and finally the parameters are + offloaded. + + If the module has no offloaded sub-modules, this function does nothing. + + Args: + layer ('torch.nn.Module'): + layer with tuners to be merged + """ + + offloaded_modules = [] + for name, module in layer.named_modules(): + if name in ["", "base_layer"]: + continue + if hasattr(module, "_hf_hook") and isinstance(module._hf_hook, AlignDevicesHook) and module._hf_hook.offload: + module._hf_hook.pre_forward(module) + offloaded_modules.append(module) + + base_layer_offload = False + if hasattr(layer, "base_layer") and ( + hasattr(layer.base_layer, "_hf_hook") + and isinstance(layer.base_layer._hf_hook, AlignDevicesHook) + and layer.base_layer._hf_hook.offload + ): + if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values(): + # retrieve the name of the original disk-offload directory + offload_folder = layer.base_layer._hf_hook.weights_map.dataset.save_folder + layer.base_layer._hf_hook.pre_forward(layer.base_layer) + base_layer_offload = True + + yield + + for module in offloaded_modules: + module._hf_hook.post_forward(module, torch.tensor([])) + + if base_layer_offload: + # re-make weights map (must be on cpu to send params to the disk via memmap if disk offload) + layer.base_layer._hf_hook.weights_map = { + name: param.to("cpu") for name, param in named_module_tensors(layer.base_layer) + } + # offload weights map to disk if original device is the disk + if torch.device("meta") in layer.base_layer._hf_hook.original_devices.values(): + # rewrite directory with merged weights + offload_state_dict(offload_folder, layer.base_layer._hf_hook.weights_map) + layer.base_layer._hf_hook.post_forward(layer.base_layer, torch.tensor([])) + + +class BaseTuner(nn.Module, ABC): + r""" + A base tuner model that provides the common methods and attributes for all tuners that are injectable into a + torch.nn.Module + + For adding a new Tuner class, one needs to overwrite the following methods: + + - **_prepare_adapter_config**: + A private method to eventually prepare the adapter config, for example in case the field `target_modules` is + missing. + - **_create_and_replace**: + A private method to create and replace the target module with the adapter module. + - **_check_target_module_exists**: + A private helper method to check if the passed module's key name matches any of the target modules in the + adapter_config. + + The easiest is to check what is done in the `peft.tuners.lora.LoraModel` class. + + Attributes: + model (`torch.nn.Module`): + The model to which the adapter tuner layers will be attached. + forward (`Callable`): + The forward method of the model. + peft_config (`Union[`PeftConfig`, dict[str, PeftConfig]]`): + The adapter configuration object, it should be a dictionary of `str` to `PeftConfig` objects. One can also + pass a PeftConfig object and a new adapter will be created with the default name `adapter` or create a new + dictionary with a key `adapter_name` and a value of that peft config. + config (`dict[str, Any]`): + The model configuration object, it should be a dictionary of `str` to `Any` objects. + targeted_module_names (`list[str]`): + The list of module names that were actually adapted. Can be useful to inspect if you want to quickly + double-check that the `config.target_modules` where specified correctly. 
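+
+    Example (an illustrative sketch added for orientation, not part of the upstream docstring; the
+    "gpt2" checkpoint and the "c_attn" target module are placeholder choices):
+
+    ```py
+    >>> from transformers import AutoModelForCausalLM
+    >>> from peft import LoraConfig, get_peft_model
+
+    >>> base_model = AutoModelForCausalLM.from_pretrained("gpt2")
+    >>> config = LoraConfig(task_type="CAUSAL_LM", target_modules=["c_attn"], r=8)
+    >>> peft_model = get_peft_model(base_model, config)
+    >>> # get_peft_model builds a LoraModel (a BaseTuner subclass) around the base model and
+    >>> # records every adapted module name in `peft_model.base_model.targeted_module_names`
+    ```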
+ """ + + def __init__(self, model, peft_config: Union[PeftConfig, dict[str, PeftConfig]], adapter_name: str) -> None: + super().__init__() + + self.model = model + self.targeted_module_names: list[str] = [] + + # For advanced developers, if you want to attach multiple adapters to your + # model, just add a `peft_config` dict attribute to your model. + if not hasattr(self, "peft_config"): + self.peft_config = {adapter_name: peft_config} if isinstance(peft_config, PeftConfig) else peft_config + else: + logger.info( + "Already found a `peft_config` attribute in the model. This will lead to having multiple adapters" + " in the model. Make sure to know what you are doing!" + ) + if isinstance(peft_config, PeftConfig): + self.peft_config[adapter_name] = peft_config + else: + # user is adding a dict of PeftConfigs + self.peft_config.update(peft_config) + + self.active_adapter = adapter_name + self.inject_adapter(self.model, adapter_name) + + # Copy the peft_config in the injected model. + self.model.peft_config = self.peft_config + + @property + def active_adapters(self) -> list[str]: + if isinstance(self.active_adapter, str): + return [self.active_adapter] + # is already a list of str + return self.active_adapter + + def forward(self, *args: Any, **kwargs: Any): + return self.model.forward(*args, **kwargs) + + @abstractmethod + def _prepare_adapter_config(self, peft_config: PeftConfig, model_config: dict) -> PeftConfig: + r""" + A private method to eventually prepare the adapter config. For transformers based models, if + `peft_config.target_modules` is None, we can automatically infer the target modules from the + `TRANSFORMERS_MODELS_TO_XXX_TARGET_MODULES_MAPPING`. This method can be further refactored in the future to + automatically infer it for all tuner models. + + Check out `peft.tuner.lora.LoraModel._prepare_adapter_config` for an example. + + Args: + peft_config (`str`): + The adapter config. + model_config (`str`): + The transformers model config, that config should contain the `model_type` key. + """ + ... + + @abstractmethod + def _check_target_module_exists(peft_config: PeftConfig, key: str) -> bool: + r""" + A helper private method to check if the passed module's key name matches any of the target modules in the + `peft_config.target_modules` list. If it does, return `True`, else return `False`. + + Args: + peft_config (`PeftConfig`): + The adapter config. + key (`str`): + The module's key name. + """ + ... + + @abstractmethod + def _create_and_replace( + self, + peft_config: PeftConfig, + adapter_name: str, + target: nn.Module, + target_name: str, + parent: nn.Module, + current_key: str, + ) -> None: + r""" + Inplace replacement of the target module with the adapter layer. This method needs to be overridden by all the + tuner classes. + + Check `peft.tuners.lora.LoraModel._create_and_replace` for an example. + + Args: + peft_config (`PeftConfig`): + The adapter config. + adapter_name (`str`): + The adapter name. + target (`nn.Module`): + The target module. + target_name (`str`): + The target module's name. + parent (`nn.Module`): + The parent module. + current_key (`str`): + The key of the current target being adapted. + """ + ... + + @abstractmethod + def _mark_only_adapters_as_trainable(self, model: nn.Module): + r""" + A helper method to mark only the adapter layers as trainable (i.e. module.requires_grad = False) This needs to + be overridden for all tuner classes to match the correct key names. 
+ + Check `peft.tuners.lora.LoraModel._mark_only_adapters_as_trainable` for an example. + """ + ... + + def _check_new_adapter_config(self, config: PeftConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + pass + + def inject_adapter(self, model: nn.Module, adapter_name: str): + r""" + Creates adapter layers and replaces the target modules with the adapter layers. This method is called under the + hood by `peft.mapping.get_peft_model` if a non-prompt tuning adapter class is passed. + + The corresponding PEFT config is directly retrieved from the `peft_config` attribute of the BaseTuner class. + + Args: + model (`nn.Module`): + The model to be tuned. + adapter_name (`str`): + The adapter name. + """ + peft_config = self.peft_config[adapter_name] + # Note: If possible, all checks should be performed *at the start of this method*. + # This way, we can raise early if something goes wrong, without leaving the model + # in a bad (half-initialized) state. + self._check_new_adapter_config(peft_config) + + is_target_modules_in_base_model = False + key_list = [key for key, _ in model.named_modules()] + + _check_for_modules_to_save = getattr(peft_config, "modules_to_save", None) is not None + _has_modules_to_save = False + + model_config = getattr(model, "config", {"model_type": "custom"}) + if hasattr(model_config, "to_dict"): + model_config = model_config.to_dict() + + peft_config = self._prepare_adapter_config(peft_config, model_config) + + # update peft_config.target_modules if required + peft_config = _maybe_include_all_linear_layers(peft_config, model) + + for key in key_list: + # Check for modules_to_save in case + if _check_for_modules_to_save and any( + key.endswith(f"{module_to_save}") for module_to_save in peft_config.modules_to_save + ): + # Optionally set the modules to save + parent, target, target_name = _get_submodules(model, key) + + if not isinstance(target, ModulesToSaveWrapper): + new_module = ModulesToSaveWrapper(target, adapter_name) + setattr(parent, target_name, new_module) + else: + target.update(adapter_name) + + _has_modules_to_save = True + continue + + if not self._check_target_module_exists(peft_config, key): + continue + + self.targeted_module_names.append(key) + is_target_modules_in_base_model = True + parent, target, target_name = _get_submodules(model, key) + self._create_and_replace(peft_config, adapter_name, target, target_name, parent, current_key=key) + + if not is_target_modules_in_base_model: + raise ValueError( + f"Target modules {peft_config.target_modules} not found in the base model. " + f"Please check the target modules and try again." + ) + + self._mark_only_adapters_as_trainable(model) + + if self.peft_config[adapter_name].inference_mode: + for n, p in model.named_parameters(): + if adapter_name in n: + p.requires_grad = False + + if _has_modules_to_save: + if not hasattr(model, "modules_to_save"): + model.modules_to_save = set(peft_config.modules_to_save) + else: + model.modules_to_save.update(set(peft_config.modules_to_save)) + + def merge_adapter(self, adapter_names: Optional[list[str]] = None) -> None: + """ + This method merges the adapter layers into the base model. + + Merging adapters can lead to a speed up of the forward pass. A copy of the adapter weights is still kept in + memory, which is required to unmerge the adapters. 
In order to merge the adapter weights without keeping them + in memory, please call `merge_and_unload`. + + Args: + safe_merge (`bool`, *optional*): + If `True`, the merge operation will be performed in a copy of the original weights and check for NaNs + before merging the weights. This is useful if you want to check if the merge operation will produce + NaNs. Defaults to `False`. + adapter_names (`list[str]`, *optional*): + The list of adapter names that should be merged. If `None`, all active adapters will be merged. + Defaults to `None`. + """ + for module in self.model.modules(): + if isinstance(module, BaseTunerLayer): + with onload_layer(module): + module.merge(adapter_names=adapter_names) + + def unmerge_adapter(self): + """ + This method unmerges all merged adapter layers from the base model. + """ + for module in self.model.modules(): + if isinstance(module, BaseTunerLayer): + with onload_layer(module): + module.unmerge() + + def _unloading_checks(self, adapter_names: Optional[list[str]]): + adapters_to_consider = adapter_names or self.active_adapters + is_modules_to_save_available = any( + self.peft_config[adapter].modules_to_save for adapter in adapters_to_consider + ) + if is_modules_to_save_available and len(adapters_to_consider) > 1: + raise ValueError("Cannot unload multiple adapters that specify `modules_to_save`.") + + +class BaseTunerLayer(ABC): + r""" + A tuner layer mixin that provides the common methods and attributes for all tuners. + + Args: + is_pluggable (`bool`, *optional*): + Whether the adapter layer can be plugged to any pytorch module + active_adapters (Union[List[`str`], `str`], *optional*): + The name of the active adapter. + """ + + active_adapter = None + + # All names of layers that may contain adapter (trainable) weights + adapter_layer_names: tuple[str] = () + # All names of other parameters that may contain adapter-related parameters + other_param_names: tuple[str] = () + + # indicates whether all adapters should be disabled + _disable_adapters: bool = False + + # the currently active adapter(s) + _active_adapter: str | list[str] = "default" + + # List all merged adapters + merged_adapters: list[str] = [] + + def get_base_layer(self) -> nn.Module: + """ + (Recursively) get the base_layer. + + This is necessary for the case that the tuner layer wraps another tuner layer. + + """ + base_layer = self + while hasattr(base_layer, "base_layer"): + base_layer = base_layer.base_layer + return base_layer + + @property + def weight(self) -> torch.Tensor: + # This is required for some transformers code, e.g. for T5, weight is accessed as: + # self.wo.weight + # where "wo" is the adapter layer. 
+ # https://github.com/huggingface/transformers/blob/78f6ed6c70b29c1560780e3869a7ad4c6b3d2710/src/transformers + # /models/t5/modeling_t5.py#L292 + base_layer = self.get_base_layer() + if hasattr(base_layer, "qweight"): + # QuantLinear + weight = base_layer.qweight + else: + # Other layers + weight = base_layer.weight + return weight + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + raise NotImplementedError + + def unmerge(self) -> None: + raise NotImplementedError + + @property + def merged(self) -> bool: + return bool(self.merged_adapters) + + @property + def disable_adapters(self) -> bool: + # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method + return self._disable_adapters + + @property + def active_adapter(self) -> str: + # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method + return self._active_adapter + + @property + def active_adapters(self): + if isinstance(self.active_adapter, str): + return [self.active_adapter] + # is already a list of str + return self.active_adapter + + def enable_adapters(self, enabled: bool) -> None: + """Toggle the enabling and disabling of adapters + + Takes care of setting the requires_grad flag for the adapter weights. + + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if enabled: + self.set_adapter(self.active_adapters) + self._disable_adapters = False + else: + # disable grads on all adapter layers + for layer_name in self.adapter_layer_names: + layer = getattr(self, layer_name) + layer.requires_grad_(False) + self._disable_adapters = True + + def set_adapter(self, adapter_names: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `List[str]`): Name of the adapter(s) to be activated. + """ + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + # Deactivate grads on the inactive adapter and activate grads on the active adapter + for layer_name in self.adapter_layer_names: + module_dict = getattr(self, layer_name) + for key, layer in module_dict.items(): + if key in adapter_names: + # Note: It is possible that not a single layer is called with requires_grad_(True) here. This may + # happen if a completely different adapter layer is being activated. + layer.requires_grad_(True) + else: + layer.requires_grad_(False) + + self._active_adapter = adapter_names + + def _all_available_adapter_names(self) -> list[str]: + """Return a sorted list of all available adapter names""" + adapter_names = set() + for name in self.adapter_layer_names + self.other_param_names: + # we check each possible attribute and if it's a dict or ModuleDict, we assume that the keys are the adapter + # names + attr = getattr(self, name) + if hasattr(attr, "keys"): + adapter_names.update(attr.keys()) + return sorted(adapter_names) + + def delete_adapter(self, adapter_name: str) -> None: + """ + Delete an adapter from the layer + + This should be called on all adapter layers, or else we will get an inconsistent state. + + This method will also set a new active adapter if the deleted adapter was an active adapter. 
It is important + that the new adapter is chosen in a deterministic way, so that the same adapter is chosen on all layers. + + Args: + adapter_name (`str`): The name of the adapter to delete + + """ + for attr in self.adapter_layer_names + self.other_param_names: + if adapter_name in getattr(self, attr): + del getattr(self, attr)[adapter_name] + + if adapter_name in self.active_adapters: + # choose a new active adapter + active_adapters = self.active_adapters[:] + active_adapters.remove(adapter_name) + if active_adapters: + self.set_adapter(active_adapters) + else: + # no active adapters left, set a new default adapter + # here we get the list of all adapters existing adapter names and choose the first one + remaining_adapters = self._all_available_adapter_names() + if not remaining_adapters: + self.set_adapter([]) + else: + new_active_adapter = remaining_adapters[0] + warnings.warn( + f"Adapter {adapter_name} was active which is now deleted. Setting active adapter to " + f"{new_active_adapter}." + ) + self.set_adapter(remaining_adapters[0]) + + +def check_target_module_exists(config, key: str) -> bool | re.Match[str] | None: + """A helper method to check if the passed module's key name matches any of the target modules in the adapter_config. + + Args: + config (`LoraConfig` | `LycorisConfig`): A config to match target modules from + key (`str`): A key to search any matches in config + + Returns: + `bool` | `re.Match[str]` | `None`: True of match object if key matches any target modules from config, False or + None if no match found + """ + if isinstance(config.target_modules, str): + target_module_found = re.fullmatch(config.target_modules, key) + elif key in config.target_modules: + # this module is specified directly in target_modules + target_module_found = True + else: + target_module_found = any(key.endswith(f".{target_key}") for target_key in config.target_modules) + + layer_indexes = getattr(config, "layers_to_transform", None) + layers_pattern = getattr(config, "layers_pattern", None) + + is_using_layer_indexes = layer_indexes is not None and ( + len(layer_indexes) != 0 if isinstance(layer_indexes, list) else True + ) + if is_using_layer_indexes and target_module_found: + layer_index = None + # TODO: It's still unclear how empty layers_pattern (None, [], or "") should behave + # For now, empty layers_pattern means any layer pattern is ok + if layers_pattern is None or len(layers_pattern) == 0: + layer_index = re.match(r".*\.[^.]*\.(\d+)\.", key) + else: + layers_pattern = [layers_pattern] if isinstance(layers_pattern, str) else layers_pattern + for pattern in layers_pattern: + layer_index = re.match(rf".*\.{pattern}\.(\d+)\.", key) + if layer_index is not None: + break + + if layer_index is None: + target_module_found = False + else: + layer_index = int(layer_index.group(1)) + if isinstance(layer_indexes, int): + target_module_found = layer_index == layer_indexes + else: + target_module_found = layer_index in layer_indexes + + return target_module_found + + +def inspect_matched_modules(tuner: BaseTuner, adapter_name: str = "default") -> dict: + """ + A helper function to inspect the set of matched and unmatched modules for a PEFT model and the given adapter. 
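+
+    Example (an illustrative sketch; `peft_model` is assumed to be any model returned by
+    `get_peft_model`, whose `base_model` attribute is the `BaseTuner` instance expected here):
+
+    ```py
+    >>> module_dict = inspect_matched_modules(peft_model.base_model)
+    >>> sorted(module_dict)
+    ['matched', 'unmatched']
+    >>> # every module key of the wrapped model lands in exactly one of the two lists
+    ```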
+ """ + config = tuner.peft_config[adapter_name] + key_list = [key for key, _ in tuner.model.named_modules()] + module_dict = {"matched": [], "unmatched": []} + for key in key_list: + if tuner._check_target_module_exists(config, key): + module_dict["matched"].append(key) + else: + module_dict["unmatched"].append(key) + return module_dict + + +def _maybe_include_all_linear_layers(peft_config: PeftConfig, model: nn.Module) -> PeftConfig: + """ + Helper function to update `target_modules` to all linear/Conv1D layers if provided as 'all-linear'. Adapted from + the QLoRA repository: https://github.com/artidoro/qlora/blob/main/qlora.py + """ + + # if `target_modules` is a string, convert to lower case and check if it matches "all-linear" + if not ( + isinstance(peft_config.target_modules, str) + and peft_config.target_modules.lower() == INCLUDE_LINEAR_LAYERS_SHORTHAND + ): + return peft_config + + if not isinstance(model, PreTrainedModel): + raise ValueError( + f"Only instances of PreTrainedModel support `target_modules={INCLUDE_LINEAR_LAYERS_SHORTHAND!r}`" + ) + + linear_classes = (torch.nn.Linear, Conv1D) + + linear_module_names = set() + for name, module in model.named_modules(): + # match with all linear classes. + if isinstance(module, linear_classes): + names = name.rsplit(".", 1)[-1] # get the base name + linear_module_names.add(names) + + # ignore the last classification head for text generation models + output_emb = model.get_output_embeddings() + if output_emb is not None: + last_module_name = [name for name, module in model.named_modules() if module is output_emb][0] + linear_module_names -= {last_module_name} + peft_config.target_modules = linear_module_names + return peft_config + + +def check_adapters_to_merge(module: BaseTunerLayer, adapter_names: Optional[list[str]] = None) -> list[str]: + """ + Helper function to check which adapters should be merged. + + Only return those adapters that are not already merged. Give a warning if some or all of the adapters are already + merged. + + """ + if adapter_names is None: + adapter_names = module.active_adapters + + if module.merged: + merged_adapters = set(module.merged_adapters) + adapter_names = [name for name in adapter_names if name not in merged_adapters] + + if adapter_names: + warnings.warn( + f"Already following adapters were merged {','.join(module.merged_adapters)}. " + f"You are now additionally merging {','.join(adapter_names)}." + ) + else: + warnings.warn("All adapters are already merged, nothing to do.") + + return adapter_names diff --git a/MoRA/peft_mora/utils/__init__.py b/MoRA/peft_mora/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dbd48de9c7beb40ee0570907353c4d1c1c109717 --- /dev/null +++ b/MoRA/peft_mora/utils/__init__.py @@ -0,0 +1,51 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
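`_maybe_include_all_linear_layers` above expands the `all-linear` shorthand by collecting the base names of every linear layer and dropping the output embedding head. A toy reproduction on a hypothetical module (no transformers dependency; all module names invented):

``` python
import torch.nn as nn

class ToyLM(nn.Module):
    # stand-in for a tiny decoder block plus LM head
    def __init__(self):
        super().__init__()
        self.q_proj = nn.Linear(32, 32)
        self.v_proj = nn.Linear(32, 32)
        self.up_proj = nn.Linear(32, 64)
        self.down_proj = nn.Linear(64, 32)
        self.lm_head = nn.Linear(32, 100)

model = ToyLM()
linear_module_names = {
    name.rsplit(".", 1)[-1]                 # keep only the base name, as in the helper above
    for name, module in model.named_modules()
    if isinstance(module, nn.Linear)
}
linear_module_names -= {"lm_head"}          # the output head is excluded, as above
print(sorted(linear_module_names))          # ['down_proj', 'q_proj', 'up_proj', 'v_proj']
```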
+# See the License for the specific language governing permissions and +# limitations under the License. + +# from .config import PeftConfig, PeftType, PromptLearningConfig, TaskType +from .peft_types import PeftType, TaskType +from .other import ( + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + CONFIG_NAME, + WEIGHTS_NAME, + SAFETENSORS_WEIGHTS_NAME, + INCLUDE_LINEAR_LAYERS_SHORTHAND, + _set_trainable, + bloom_model_postprocess_past_key_value, + prepare_model_for_int8_training, + prepare_model_for_kbit_training, + shift_tokens_right, + transpose, + _get_batch_size, + _get_submodules, + _set_adapter, + _freeze_adapter, + ModulesToSaveWrapper, + _prepare_prompt_learning_config, + _is_valid_match, + infer_device, + get_auto_gptq_quant_linear, + get_quantization_config, + id_tensor_storage, + cast_mixed_precision_params, +) +from .save_and_load import get_peft_model_state_dict, set_peft_model_state_dict, load_peft_weights diff --git a/MoRA/peft_mora/utils/__pycache__/__init__.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ca0a5636a270e534b70a70cba2f186d707a24471 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/__init__.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/constants.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/constants.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4da0c1fbe2e05f36e21caa508baa54ad22de457 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/constants.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/integrations.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/integrations.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b20211e11e77879dc6329c69d520f3ba8707c806 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/integrations.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/merge_utils.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/merge_utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c89d7f523a44271ef34f1ffed601a763b6ac313d Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/merge_utils.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/other.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/other.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d6109f8e7c347da53a2eec058ef3cef1025d5b2 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/other.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/peft_types.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/peft_types.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..09c30b269ef49c25dcc5b664609a2b74db2d7fbb Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/peft_types.cpython-312.pyc differ diff --git a/MoRA/peft_mora/utils/__pycache__/save_and_load.cpython-312.pyc b/MoRA/peft_mora/utils/__pycache__/save_and_load.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b7541849e51a3b1b9f452d900aa6109f6e4b9d98 Binary files /dev/null and b/MoRA/peft_mora/utils/__pycache__/save_and_load.cpython-312.pyc differ diff 
--git a/MoRA/peft_mora/utils/constants.py b/MoRA/peft_mora/utils/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..41047431cef3c18c603b4a331b791cbf00f4cdf2 --- /dev/null +++ b/MoRA/peft_mora/utils/constants.py @@ -0,0 +1,158 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + + +# needed for prefix-tuning of bloom model +def bloom_model_postprocess_past_key_value(past_key_values): + past_key_values = torch.cat(past_key_values) + total_layers, batch_size, num_attention_heads, num_virtual_tokens, head_dim = past_key_values.shape + keys = past_key_values[: total_layers // 2] + keys = keys.transpose(2, 3).reshape( + total_layers // 2, batch_size * num_attention_heads, head_dim, num_virtual_tokens + ) + values = past_key_values[total_layers // 2 :] + values = values.reshape(total_layers // 2, batch_size * num_attention_heads, num_virtual_tokens, head_dim) + + return tuple(zip(keys, values)) + + +# needed for prefix-tuning of StarCoder models +def starcoder_model_postprocess_past_key_value(past_key_values): + result = [] + for k in past_key_values: + k = k[:, :, 0] + k = k.permute([1, 2, 0, 3]) + k = k.reshape(*k.shape[:-2], -1) + result.append(k) + return tuple(result) + + +TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING = { + "bloom": bloom_model_postprocess_past_key_value, + "gpt_bigcode": starcoder_model_postprocess_past_key_value, +} + + +TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "v"], + "mt5": ["q", "v"], + "bart": ["q_proj", "v_proj"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "blip-2": ["q", "v", "q_proj", "v_proj"], + "opt": ["q_proj", "v_proj"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "value"], + "xlm-roberta": ["query", "value"], + "electra": ["query", "value"], + "deberta-v2": ["query_proj", "value_proj"], + "deberta": ["in_proj"], + "layoutlm": ["query", "value"], + "llama": ["q_proj", "v_proj"], + "chatglm": ["query_key_value"], + "gpt_bigcode": ["c_attn"], + "mpt": ["Wqkv"], + "RefinedWebModel": ["query_key_value"], + "RefinedWeb": ["query_key_value"], + "falcon": ["query_key_value"], + "btlm": ["c_proj", "c_attn"], + "codegen": ["qkv_proj"], + "mistral": ["q_proj", "v_proj"], + "mixtral": ["q_proj", "v_proj"], + "stablelm": ["q_proj", "v_proj"], + "phi": ["q_proj", "v_proj", "fc1", "fc2"], + "gemma": ["q_proj", "v_proj"], +} + +TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING = { + "t5": ["k", "v", "wo"], + "mt5": ["k", "v", "wi_1"], + "gpt2": ["c_attn", "mlp.c_proj"], + "bloom": ["query_key_value", "mlp.dense_4h_to_h"], + "roberta": ["key", "value", "output.dense"], + "opt": ["q_proj", "k_proj", "fc2"], + "gptj": ["q_proj", "v_proj", "fc_out"], + "gpt_neox": ["query_key_value", "dense_4h_to_h"], + "gpt_neo": ["q_proj", "v_proj", "c_proj"], + "bart": ["q_proj", "v_proj", "fc2"], + "gpt_bigcode": ["c_attn", "mlp.c_proj"], + 
"llama": ["k_proj", "v_proj", "down_proj"], + "mistral": ["k_proj", "v_proj", "down_proj"], + "mixtral": ["k_proj", "v_proj", "w2"], + "bert": ["key", "value", "output.dense"], + "deberta-v2": ["key_proj", "value_proj", "output.dense"], + "deberta": ["in_proj", "output.dense"], + "RefinedWebModel": ["query_key_value", "dense_4h_to_h"], + "RefinedWeb": ["query_key_value", "dense_4h_to_h"], + "falcon": ["query_key_value", "dense_4h_to_h"], + "phi": ["q_proj", "v_proj", "fc2"], + "gemma": ["q_proj", "v_proj", "down_proj"], +} + +TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING = { + "t5": ["wo"], + "mt5": [], + "gpt2": ["mlp.c_proj"], + "bloom": ["mlp.dense_4h_to_h"], + "roberta": ["output.dense"], + "opt": ["fc2"], + "gptj": ["fc_out"], + "gpt_neox": ["dense_4h_to_h"], + "gpt_neo": ["c_proj"], + "bart": ["fc2"], + "gpt_bigcode": ["mlp.c_proj"], + "llama": ["down_proj"], + "mistral": ["down_proj"], + "mixtral": ["w2"], + "bert": ["output.dense"], + "deberta-v2": ["output.dense"], + "deberta": ["output.dense"], + "RefinedWeb": ["dense_4h_to_h"], + "RefinedWebModel": ["dense_4h_to_h"], + "falcon": ["dense_4h_to_h"], + "phi": ["fc2"], + "gemma": ["down_proj"], +} + +TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING = { + "t5": ["q", "k", "v", "o", "wi", "wo"], + "mt5": ["q", "k", "v", "o", "wi_0", "wi_1", "wo"], + "bart": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gpt2": ["c_attn"], + "bloom": ["query_key_value"], + "opt": ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"], + "gptj": ["q_proj", "v_proj"], + "gpt_neox": ["query_key_value"], + "gpt_neo": ["q_proj", "v_proj"], + "llama": ["q_proj", "v_proj"], + "bert": ["query", "value"], + "roberta": ["query", "key", "value", "dense"], + # "xlm-roberta": ["query", "value"], + # "electra": ["query", "value"], + "deberta-v2": ["query_proj", "key_proj", "value_proj", "dense"], + "gpt_bigcode": ["c_attn"], + "deberta": ["in_proj"], + # "layoutlm": ["query", "value"], +} + +WEIGHTS_NAME = "adapter_model.bin" +SAFETENSORS_WEIGHTS_NAME = "adapter_model.safetensors" +CONFIG_NAME = "adapter_config.json" +EMBEDDING_LAYER_NAMES = ["embed_tokens", "lm_head"] +INCLUDE_LINEAR_LAYERS_SHORTHAND = "all-linear" +TOKENIZER_CONFIG_NAME = "tokenizer_config.json" diff --git a/MoRA/peft_mora/utils/integrations.py b/MoRA/peft_mora/utils/integrations.py new file mode 100644 index 0000000000000000000000000000000000000000..3b0e139483ec12632a4183441628502e7122f179 --- /dev/null +++ b/MoRA/peft_mora/utils/integrations.py @@ -0,0 +1,39 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from contextlib import contextmanager + +import packaging.version +import torch +import transformers + + +@contextmanager +def gather_params_ctx(module: torch.nn.Module, modifier_rank: int = 0): + """Call DeepSpeed GatheredParameters context manager if DeepSpeed is enabled, otherwise do nothing.""" + if packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.33.0"): + from transformers.integrations import is_deepspeed_zero3_enabled + else: + from transformers.deepspeed import is_deepspeed_zero3_enabled + + if not is_deepspeed_zero3_enabled(): + yield + return + + import deepspeed + + params_to_gather = module.parameters() + with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=modifier_rank): + yield + return diff --git a/MoRA/peft_mora/utils/loftq_utils.py b/MoRA/peft_mora/utils/loftq_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9f83417f3ca350f198b8d759cfc2231ee6b56c00 --- /dev/null +++ b/MoRA/peft_mora/utils/loftq_utils.py @@ -0,0 +1,229 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Reference code: https://github.com/yxli2123/LoftQ/blob/main/utils.py +# Reference paper: https://arxiv.org/abs/2310.08659 + +import logging +from typing import Union + +import torch + +from peft_mora.import_utils import is_bnb_4bit_available, is_bnb_available + + +if is_bnb_available(): + import bitsandbytes as bnb + + +class NFQuantizer: + def __init__(self, num_bits=2, device="cuda", method="normal", block_size=64, *args, **kwargs): + super().__init__(*args, **kwargs) + self.num_bits = num_bits + self.device = device + self.method = method + self.block_size = block_size + if self.method == "normal": + self.norm_lookup_table = self.create_normal_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + elif self.method == "uniform": + self.norm_lookup_table = self.create_uniform_map(num_bits=self.num_bits) + self.norm_lookup_table = self.norm_lookup_table.to(device) + else: + raise NotImplementedError("Other quantization methods not supported yet.") + + @staticmethod + def create_uniform_map(symmetric=False, num_bits=4): + if symmetric: + # print("symmetric uniform quantization") + negative = torch.linspace(-1, 0, 2 ** (num_bits - 1)) + positive = torch.linspace(0, 1, 2 ** (num_bits - 1)) + table = torch.cat([negative, positive[1:]]) + else: + # print("asymmetric uniform quantization") + table = torch.linspace(-1, 1, 2**num_bits) + return table + + @staticmethod + def create_normal_map(offset=0.9677083, symmetric=False, num_bits=2): + try: + from scipy.stats import norm + except ImportError: + raise ImportError("The required package 'scipy' is not installed. 
Please install it to continue.") + + variations = 2**num_bits + if symmetric: + v = norm.ppf(torch.linspace(1 - offset, offset, variations + 1)).tolist() + values = [] + for index in range(len(v) - 1): + values.append(0.5 * v[index] + 0.5 * v[index + 1]) + v = values + else: + # one more positive value, this is an asymmetric type + v1 = norm.ppf(torch.linspace(offset, 0.5, variations // 2 + 1)[:-1]).tolist() + v2 = [0] + v3 = (-norm.ppf(torch.linspace(offset, 0.5, variations // 2)[:-1])).tolist() + v = v1 + v2 + v3 + + values = torch.Tensor(v) + values = values.sort().values + values /= values.max() + return values + + def quantize_tensor(self, weight): + max_abs = torch.abs(weight).max() + weight_normed = weight / max_abs + + weight_normed_expanded = weight_normed.unsqueeze(-1) + + # Reshape L to have the same number of dimensions as X_expanded + L_reshaped = torch.tensor(self.norm_lookup_table).reshape(1, -1) + + # Calculate the absolute difference between X_expanded and L_reshaped + abs_diff = torch.abs(weight_normed_expanded - L_reshaped) + + # Find the index of the minimum absolute difference for each element + qweight = torch.argmin(abs_diff, dim=-1) + return qweight, max_abs + + def dequantize_tensor(self, qweight, max_abs): + qweight_flatten = qweight.flatten() + + weight_normed = self.norm_lookup_table[qweight_flatten] + weight = weight_normed * max_abs + + weight = weight.reshape(qweight.shape) + + return weight + + def quantize_block(self, weight): + if len(weight.shape) != 2: + raise ValueError(f"Only support 2D matrix, but your input has {len(weight.shape)} dimensions.") + if weight.shape[0] * weight.shape[1] % self.block_size != 0: + raise ValueError( + f"Weight with shape ({weight.shape[0]} x {weight.shape[1]}) " + f"is not dividable by block size {self.block_size}." 
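`quantize_tensor`/`dequantize_tensor` above round each normalized weight to the nearest entry of a small lookup table and invert that mapping on the way back. A standalone round trip using a uniform 2-bit table instead of the NormalFloat table (which needs scipy):

``` python
import torch

lookup = torch.linspace(-1, 1, 2 ** 2)          # 4 levels for 2-bit quantization

weight = torch.randn(4, 4)
max_abs = weight.abs().max()
normed = weight / max_abs                       # scale into [-1, 1]

# index of the closest lookup value for every element
idx = torch.argmin((normed.unsqueeze(-1) - lookup).abs(), dim=-1)

dequant = lookup[idx] * max_abs                 # map indices back to values
print((weight - dequant).abs().max())           # quantization error, bounded by max_abs / 3
```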
+ ) + + M, N = weight.shape + device = weight.device + + # Quantization + weight_flatten = weight.flatten() # (M*N, ) + weight_block = weight_flatten.reshape(-1, self.block_size) # (L, B), L = M * N / B + if self.method == "normal": + weight_max = weight_block.abs().max(dim=-1)[0] # (L, 1) + elif self.method == "uniform": + weight_max = weight_block.mean(dim=-1) + 2.5 * weight_block.std(dim=-1) + else: + raise NotImplementedError("Method not supported yet.") + weight_max = weight_max.unsqueeze(-1) + weight_divabs = weight_block / weight_max # (L, B) + weight_divabs = weight_divabs.unsqueeze(-1) # (L, B, 1) + L_reshaped = self.norm_lookup_table.reshape(1, -1) # (1, 2**K) + + abs_diff = torch.abs(weight_divabs - L_reshaped) # (L, B, 2**K) + qweight = torch.argmin(abs_diff, dim=-1) # (L, B) + + # Pack multiple k-bit into uint8 + qweight = qweight.reshape(-1, 8 // self.num_bits) + qweight_pack = torch.zeros((M * N // 8 * self.num_bits, 1), dtype=torch.uint8, device=device) + + # data format example: + # [1, 0, 3, 2] or [01, 00, 11, 10] -> [10110001], LIFO + for i in range(8 // self.num_bits): + qweight[:, i] = qweight[:, i] << i * self.num_bits + qweight_pack[:, 0] |= qweight[:, i] + + return qweight_pack, weight_max, weight.shape + + def dequantize_block(self, qweight, weight_max, weight_shape): + # unpack weight + device = qweight.device + weight = torch.zeros((qweight.shape[0], 8 // self.num_bits), dtype=torch.float32, device=device) + for i in range(8 // self.num_bits): + lookup_table_idx = qweight.to(torch.long) % 2**self.num_bits # get the most right 2 bits + lookup_table_idx = lookup_table_idx.to(torch.long) + weight[:, i] = self.norm_lookup_table[lookup_table_idx].squeeze() + qweight = qweight >> self.num_bits # right shift 2 bits of the original data + + weight_block = weight.reshape(-1, self.block_size) + weight = weight_block * weight_max + weight = weight.reshape(weight_shape) + + return weight + + +def _low_rank_decomposition(weight, reduced_rank=32): + """ + :param weight: The matrix to decompose, of shape (H, W) :param reduced_rank: the final rank :return: + """ + matrix_dimension = len(weight.size()) + if matrix_dimension != 2: + raise ValueError(f"Only support 2D matrix, but your input has {matrix_dimension} dimensions.") + + # Use SVD to decompose a matrix, default full_matrices is False to save parameters + U, S, Vh = torch.linalg.svd(weight, full_matrices=False) + + L = U @ (torch.sqrt(torch.diag(S)[:, 0:reduced_rank])) + R = torch.sqrt(torch.diag(S)[0:reduced_rank, :]) @ Vh + + return {"L": L, "R": R, "U": U, "S": S, "Vh": Vh, "reduced_rank": reduced_rank} + + +@torch.no_grad() +def loftq_init(weight: Union[torch.Tensor, torch.nn.Parameter], num_bits: int, reduced_rank: int, num_iter=1): + if num_bits not in [2, 4, 8]: + raise ValueError("Only support 2, 4, 8 bits quantization") + if num_iter <= 0: + raise ValueError("Number of iterations must be greater than 0") + + out_feature, in_feature = weight.size() + device = weight.device + dtype = weight.dtype + + logging.info( + f"Weight: ({out_feature}, {in_feature}) | Rank: {reduced_rank} " + f"| Num Iter: {num_iter} | Num Bits: {num_bits}" + ) + if not is_bnb_4bit_available() or num_bits in [2, 8]: + quantizer = NFQuantizer(num_bits=num_bits, device=device, method="normal", block_size=64) + compute_device = device + else: + compute_device = "cuda" + + weight = weight.to(device=compute_device, dtype=torch.float32) + res = weight.clone() + for i in range(num_iter): + torch.cuda.empty_cache() + # Quantization + if num_bits == 4 
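`_low_rank_decomposition` above splits the square root of the singular values between the two factors, so the product `L @ R` is the rank-`r` truncated SVD (the best rank-`r` approximation in the Frobenius norm). A plain-torch check of that:

``` python
import torch

weight = torch.randn(64, 32)
r = 8

U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
sqrt_S = torch.sqrt(torch.diag(S))
L = U @ sqrt_S[:, :r]          # (64, r)
R = sqrt_S[:r, :] @ Vh         # (r, 32)

approx = L @ R                 # rank-r approximation of `weight`
print(torch.linalg.matrix_rank(approx))                        # tensor(8)
print(torch.linalg.norm(weight - approx) / torch.linalg.norm(weight))  # relative residual
```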
and is_bnb_4bit_available(): + qweight = bnb.nn.Params4bit( + res.to("cpu"), requires_grad=False, compress_statistics=False, quant_type="nf4" + ).to(compute_device) + dequantized_weight = bnb.functional.dequantize_4bit(qweight.data, qweight.quant_state) + else: + quantized_weight, max_abs, shape = quantizer.quantize_block(res) + dequantized_weight = quantizer.dequantize_block(quantized_weight, max_abs, shape) + + res = weight - dequantized_weight + + # Decompose the residual by SVD + output = _low_rank_decomposition(res, reduced_rank=reduced_rank) + L, R, reduced_rank = output["L"], output["R"], output["reduced_rank"] + res = weight - torch.mm(L, R) + + lora_A, lora_B = R, L + + return dequantized_weight.to(device=device, dtype=dtype), lora_A, lora_B diff --git a/MoRA/peft_mora/utils/merge_utils.py b/MoRA/peft_mora/utils/merge_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..900cb878e2d785e5e800caa58642f0532f4d574b --- /dev/null +++ b/MoRA/peft_mora/utils/merge_utils.py @@ -0,0 +1,268 @@ +# Copyright 2024-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import List, Literal + +import torch + + +def reshape_weight_task_tensors(task_tensors, weights): + """ + Reshapes `weights` to match the shape of `task_tensors` by unsqeezing in the remaining dimenions. + + Args: + task_tensors (`torch.Tensor`): The tensors that will be used to reshape `weights`. + weights (`torch.Tensor`): The tensor to be reshaped. + + Returns: + `torch.Tensor`: The reshaped tensor. + """ + new_shape = weights.shape + (1,) * (task_tensors.dim() - weights.dim()) + weights = weights.view(new_shape) + return weights + + +def magnitude_based_pruning(tensor: torch.Tensor, density: float) -> torch.Tensor: + """ + Prune the smallest values of the task tensors and retain the top-k values based on the specified fraction + `density`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + + Returns: + `torch.Tensor`: The tensor with the pruned weights. + """ + mask = torch.zeros_like(tensor).reshape(-1) + k = int(density * tensor.numel()) + top_k = torch.topk(tensor.abs().reshape(-1), k=k, largest=True) + mask[top_k[1]] = 1 + return tensor * mask.reshape(tensor.shape) + + +def random_pruning(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor: + """ + Prune random values based on the specified fraction `density`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. + + Returns: + `torch.Tensor`: The pruned tensor. 
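`magnitude_based_pruning` above keeps the `density` fraction of entries with the largest absolute value and zeroes the rest. A small worked example of the same operation:

``` python
import torch

def magnitude_prune(tensor: torch.Tensor, density: float) -> torch.Tensor:
    # keep the `density` fraction of entries with the largest magnitude, zero the rest
    k = int(density * tensor.numel())
    mask = torch.zeros_like(tensor).reshape(-1)
    topk = torch.topk(tensor.abs().reshape(-1), k=k, largest=True)
    mask[topk.indices] = 1
    return tensor * mask.reshape(tensor.shape)

delta = torch.tensor([[0.1, -2.0, 0.3], [1.5, -0.05, 0.7]])
print(magnitude_prune(delta, density=0.5))
# tensor([[ 0.0000, -2.0000,  0.0000],
#         [ 1.5000, -0.0000,  0.7000]])
```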
+ """ + mask = torch.bernoulli(torch.full_like(input=tensor, fill_value=density)) + pruned_tensor = tensor * mask + if rescale: + torch.div(input=pruned_tensor, other=density) + return pruned_tensor + + +def prune( + tensor: torch.Tensor, density: float, method: Literal["magnitude", "random"], rescale: bool = False +) -> torch.Tensor: + """ + Prune the values of task tensors based on the `method`. + + Args: + tensor (`torch.Tensor`):The tensor to prune. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + method (`str`):The method to use to prune. Should be one of ["magnitude", "random"]. + rescale (`bool`):Whether to rescale the result to preserve the expected value of the original tensor. + + Returns: + `torch.Tensor`: The pruned tensor. + """ + if density >= 1: + warnings.warn(f"The density {density} is greater than or equal to 1, no pruning will be performed.") + return tensor + elif density < 0: + raise ValueError(f"Density should be >= 0, got {density}") + if method == "magnitude": + return magnitude_based_pruning(tensor, density) + elif method == "random": + return random_pruning(tensor, density, rescale=rescale) + else: + raise ValueError(f"Unknown method {method}") + + +def calculate_majority_sign_mask( + tensor: torch.Tensor, method: Literal["total", "frequency"] = "total" +) -> torch.Tensor: + """ + Get the mask of the majority sign across the task tensors. Task tensors are stacked on dimension 0. + + Args: + tensor (`torch.Tensor`):The tensor to get the mask from. + method (`str`):The method to use to get the mask. Should be one of ["total", "frequency"]. + + Returns: + `torch.Tensor`: The majority sign mask. + """ + + sign = tensor.sign() + if method == "total": + sign_magnitude = tensor.sum(dim=0) + elif method == "frequency": + sign_magnitude = sign.sum(dim=0) + else: + raise RuntimeError(f'Unimplemented mask method "{method}"') + majority_sign = torch.where(sign_magnitude >= 0, 1, -1) + return sign == majority_sign + + +def disjoint_merge(task_tensors: torch.Tensor, majority_sign_mask: torch.Tensor) -> torch.Tensor: + """ + Merge the task tensors using disjoint merge. + + Args: + task_tensors (`torch.Tensor`):The task tensors to merge. + majority_sign_mask (`torch.Tensor`):The mask of the majority sign across the task tensors. + + Returns: + `torch.Tensor`: The merged tensor. + """ + mixed_task_tensors = (task_tensors * majority_sign_mask).sum(dim=0) + num_params_preserved = majority_sign_mask.sum(dim=0) + return mixed_task_tensors / torch.clamp(num_params_preserved, min=1.0) + + +def task_arithmetic(task_tensors: List[torch.Tensor], weights: torch.Tensor) -> torch.Tensor: + """ + Merge the task tensors using `task arithmetic`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + + Returns: + `torch.Tensor`: The merged tensor. + """ + task_tensors = torch.stack(task_tensors, dim=0) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + mixed_task_tensors = weighted_task_tensors.sum(dim=0) + return mixed_task_tensors + + +def magnitude_prune(task_tensors: List[torch.Tensor], weights: torch.Tensor, density: float) -> torch.Tensor: + """ + Merge the task tensors using `task arithmetic`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`): The fraction of values to preserve. 
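`calculate_majority_sign_mask` and `disjoint_merge` above implement the "elect a sign, then average only the agreeing entries" step of TIES. A worked example on two hypothetical task deltas:

``` python
import torch

# two made-up task deltas for the same parameter
task_tensors = torch.stack([
    torch.tensor([0.4, -0.2,  0.1]),
    torch.tensor([0.2,  0.3, -0.5]),
])

sign_magnitude = task_tensors.sum(dim=0)                 # "total" sign election, as above
majority_sign = torch.where(sign_magnitude >= 0, 1, -1)
mask = task_tensors.sign() == majority_sign              # entries agreeing with the elected sign

counts = mask.sum(dim=0).clamp(min=1)                    # avoid dividing by zero
merged = (task_tensors * mask).sum(dim=0) / counts
print(merged)  # tensor([ 0.3000,  0.3000, -0.5000])
```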
Should be in [0,1]. + + Returns: + `torch.Tensor`: The merged tensor. + """ + # sparsify + task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + mixed_task_tensors = weighted_task_tensors.sum(dim=0) + return mixed_task_tensors + + +def ties( + task_tensors: List[torch.Tensor], + weights: torch.Tensor, + density: float, + majority_sign_method: Literal["total", "frequency"] = "total", +) -> torch.Tensor: + """ + Merge the task tensors using `ties`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + majority_sign_method (`str`): + The method to use to get the majority sign mask. Should be one of ["total", "frequency"]. + + Returns: + `torch.Tensor`: The merged tensor. + """ + # sparsify + task_tensors = [prune(tensor, density, method="magnitude") for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # Elect Sign + majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + # Disjoint Merge + mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) + return mixed_task_tensors + + +def dare_linear(task_tensors: List[torch.Tensor], weights: torch.Tensor, density: float) -> torch.Tensor: + """ + Merge the task tensors using `dare linear`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + + Returns: + `torch.Tensor`: The merged tensor. + """ + # sparsify + task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + mixed_task_tensors = weighted_task_tensors.sum(dim=0) + return mixed_task_tensors + + +def dare_ties( + task_tensors: List[torch.Tensor], + weights: torch.Tensor, + density: float, + majority_sign_method: Literal["total", "frequency"] = "total", +) -> torch.Tensor: + """ + Merge the task tensors using `dare ties`. + + Args: + task_tensors(`List[torch.Tensor]`):The task tensors to merge. + weights (`torch.Tensor`):The weights of the task tensors. + density (`float`):The fraction of values to preserve. Should be in [0,1]. + majority_sign_method (`str`): + The method to use to get the majority sign mask. Should be one of ["total", "frequency"]. + + Returns: + `torch.Tensor`: The merged tensor. 
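`dare_linear`/`dare_ties` above first call `prune(..., method="random", rescale=True)`: keep each delta entry with probability `density` and divide by `density` so the expected update is unchanged. (Note that in the copy above, `random_pruning` does not assign the result of `torch.div` back to `pruned_tensor`, so the rescale appears to be dropped; the sketch below applies the division explicitly.)

``` python
import torch

torch.manual_seed(0)

def dare_drop(delta: torch.Tensor, density: float) -> torch.Tensor:
    # keep each entry with probability `density`, rescale to preserve the expectation
    mask = torch.bernoulli(torch.full_like(delta, density))
    return delta * mask / density

delta = torch.randn(10_000)
dropped = dare_drop(delta, density=0.3)
print(dropped.eq(0).float().mean())   # ~0.7 of the entries removed
print(delta.mean(), dropped.mean())   # the mean is preserved in expectation
```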
+ """ + # sparsify + task_tensors = [prune(tensor, density, method="random", rescale=True) for tensor in task_tensors] + task_tensors = torch.stack(task_tensors, dim=0) + # Elect Sign + majority_sign_mask = calculate_majority_sign_mask(task_tensors, method=majority_sign_method) + # weighted task tensors + weights = reshape_weight_task_tensors(task_tensors, weights) + weighted_task_tensors = task_tensors * weights + # Disjoint Merge + mixed_task_tensors = disjoint_merge(weighted_task_tensors, majority_sign_mask) + return mixed_task_tensors diff --git a/MoRA/peft_mora/utils/other.py b/MoRA/peft_mora/utils/other.py new file mode 100644 index 0000000000000000000000000000000000000000..71ad4848c0e9d037f3ebc02c1e3fccd1cd30958c --- /dev/null +++ b/MoRA/peft_mora/utils/other.py @@ -0,0 +1,593 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import inspect +import os +import warnings +from contextlib import nullcontext +from typing import Optional, Tuple + +import accelerate +import torch +from accelerate.hooks import add_hook_to_module, remove_hook_from_module +from accelerate.utils import is_npu_available, is_xpu_available +from huggingface_hub import file_exists +from huggingface_hub.utils import EntryNotFoundError, HFValidationError +from safetensors.torch import storage_ptr, storage_size + +from ..import_utils import is_auto_gptq_available, is_torch_tpu_available +from .constants import ( + CONFIG_NAME, + EMBEDDING_LAYER_NAMES, + INCLUDE_LINEAR_LAYERS_SHORTHAND, + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + WEIGHTS_NAME, + bloom_model_postprocess_past_key_value, + starcoder_model_postprocess_past_key_value, +) + + +__all__ = [ + "CONFIG_NAME", + "EMBEDDING_LAYER_NAMES", + "SAFETENSORS_WEIGHTS_NAME", + "TRANSFORMERS_MODELS_TO_ADALORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_FEEDFORWARD_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_IA3_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING", + "TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING", + "WEIGHTS_NAME", + "INCLUDE_LINEAR_LAYERS_SHORTHAND", + "bloom_model_postprocess_past_key_value", + "starcoder_model_postprocess_past_key_value", +] + + +# Get current device name based on available devices +def infer_device() -> str: + if torch.cuda.is_available(): + return "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + return "mps" + elif is_xpu_available(): + return "xpu" + elif is_npu_available(): + return "npu" + return "cpu" + + +def prepare_model_for_kbit_training(model, use_gradient_checkpointing=True, gradient_checkpointing_kwargs=None): + r""" + Note this method only works for `transformers` models. 
+ + This method wraps the entire protocol for preparing a model before running a training. This includes: + 1- Cast the layernorm in fp32 2- making output embedding layer require grads 3- Add the upcasting of the lm + head to fp32 + + Args: + model (`transformers.PreTrainedModel`): + The loaded model from `transformers` + use_gradient_checkpointing (`bool`, *optional*, defaults to `True`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): + Keyword arguments to pass to the gradient checkpointing function, please refer to the documentation of + `torch.utils.checkpoint.checkpoint` for more details about the arguments that you can pass to that method. + Note this is only available in the latest transformers versions (> 4.34.1). + """ + loaded_in_kbit = getattr(model, "is_loaded_in_8bit", False) or getattr(model, "is_loaded_in_4bit", False) + is_gptq_quantized = getattr(model, "quantization_method", None) == "gptq" + is_aqlm_quantized = getattr(model, "quantization_method", None) == "aqlm" + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {} + + for name, param in model.named_parameters(): + # freeze base model's layers + param.requires_grad = False + + if not is_gptq_quantized and not is_aqlm_quantized: + # cast all non INT8 parameters to fp32 + for param in model.parameters(): + if (param.dtype == torch.float16) or (param.dtype == torch.bfloat16): + param.data = param.data.to(torch.float32) + + if (loaded_in_kbit or is_gptq_quantized or is_aqlm_quantized) and use_gradient_checkpointing: + # When having `use_reentrant=False` + gradient_checkpointing, there is no need for this hack + if "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]: + # For backward compatibility + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + # To support older transformers versions, check if the model supports gradient_checkpointing_kwargs + _supports_gc_kwargs = "gradient_checkpointing_kwargs" in list( + inspect.signature(model.gradient_checkpointing_enable).parameters + ) + + if not _supports_gc_kwargs and len(gradient_checkpointing_kwargs) > 0: + warnings.warn( + "gradient_checkpointing_kwargs is not supported in this version of transformers. The passed kwargs will be ignored." + " if you want to use that feature, please upgrade to the latest version of transformers.", + FutureWarning, + ) + + gc_enable_kwargs = ( + {} if not _supports_gc_kwargs else {"gradient_checkpointing_kwargs": gradient_checkpointing_kwargs} + ) + + # enable gradient checkpointing for memory efficiency + model.gradient_checkpointing_enable(**gc_enable_kwargs) + return model + + +# For backward compatibility +def prepare_model_for_int8_training(*args, **kwargs): + warnings.warn( + "prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.", + FutureWarning, + ) + return prepare_model_for_kbit_training(*args, **kwargs) + + +# copied from transformers.models.bart.modeling_bart +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. 
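`prepare_model_for_kbit_training` above boils down to freezing every base parameter, upcasting any fp16/bf16 parameters to fp32, and then wiring up gradient checkpointing for quantized models. A minimal sketch of the first two steps on a toy module (a plain stand-in, not a transformers model):

``` python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 16), nn.Linear(16, 4)).to(torch.bfloat16)

# 1) freeze the base model
for param in model.parameters():
    param.requires_grad = False

# 2) upcast half-precision parameters to fp32 for numerically stable training
for param in model.parameters():
    if param.dtype in (torch.float16, torch.bfloat16):
        param.data = param.data.to(torch.float32)

print(all(not p.requires_grad for p in model.parameters()))  # True
print({p.dtype for p in model.parameters()})                 # {torch.float32}
```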
+ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): input ids + pad_token_id (`int`): The id of the `padding` token. + decoder_start_token_id (`int`): The id of the `start` token. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class ModulesToSaveWrapper(torch.nn.Module): + def __init__(self, module_to_save, adapter_name): + super().__init__() + self.original_module = module_to_save + self.modules_to_save = torch.nn.ModuleDict({}) + self._active_adapter = adapter_name + self._disable_adapters = False + self.update(adapter_name) + self.check_module() + + def check_module(self): + """Perform some sanity checks on the module to ensure that it works""" + # Try to anticipate some modules that users could try to target that would not work. + # Note: It's not possible to check hasattr(module, "forward"), since that returns True for ModuleDict and + # ModuleList, even though their forward methods cannot be called + forbidden_classes = (torch.nn.ModuleDict, torch.nn.ModuleList, torch.nn.ParameterDict, torch.nn.ParameterList) + if isinstance(self.original_module, forbidden_classes): + cls_name = self.original_module.__class__.__name__ + raise TypeError(f"modules_to_save cannot be applied to modules of type {cls_name}") + + @property + def disable_adapters(self) -> bool: + # use a property to ensure that disable_adapters is not set directly, instead use the enable_adapters method + return self._disable_adapters + + @property + def active_adapter(self) -> str: + # use a property to ensure that active_adapter is not set directly, instead use the set_adapter method + return self._active_adapter + + @property + def weight(self): + if self.active_adapter not in self.modules_to_save: + return self.original_module.weight + return self.modules_to_save[self.active_adapter].weight + + def update(self, adapter_name): + context_manager = nullcontext() + for _, param in self.original_module.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + import deepspeed + + context_manager = deepspeed.zero.GatheredParameters(self.original_module.parameters(), modifier_rank=0) + break + with context_manager: + self.modules_to_save.update(torch.nn.ModuleDict({adapter_name: copy.deepcopy(self.original_module)})) + + if hasattr(self.modules_to_save[adapter_name], "_hf_hook"): + old_hook = self.modules_to_save[adapter_name]._hf_hook + new_hook = self._create_new_hook(old_hook) + remove_hook_from_module(self.modules_to_save[adapter_name]) + add_hook_to_module(self.modules_to_save[adapter_name], new_hook) + + self.original_module.requires_grad_(False) + if adapter_name == self.active_adapter: + self.modules_to_save[adapter_name].requires_grad_(True) + + def _create_new_hook(self, old_hook): + r""" + Creates a new hook based on the old hook. Use it only if you know what you are doing ! 
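`shift_tokens_right` above builds decoder inputs by prepending the start token and replacing the `-100` label padding with `pad_token_id`. A tiny worked example with the same logic reproduced inline:

``` python
import torch

def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    # same steps as the function above, reproduced for a standalone demo
    shifted = input_ids.new_zeros(input_ids.shape)
    shifted[:, 1:] = input_ids[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    shifted.masked_fill_(shifted == -100, pad_token_id)
    return shifted

labels = torch.tensor([[15, 27, 42, -100, -100]])
print(shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=2))
# tensor([[ 2, 15, 27, 42,  0]])
```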
+ """ + old_hook_cls = getattr(accelerate.hooks, old_hook.__class__.__name__) + old_hook_attr = old_hook.__dict__ + filtered_old_hook_attr = {} + old_hook_init_signature = inspect.signature(old_hook_cls.__init__) + for k in old_hook_attr.keys(): + if k in old_hook_init_signature.parameters: + filtered_old_hook_attr[k] = old_hook_attr[k] + new_hook = old_hook_cls(**filtered_old_hook_attr) + return new_hook + + def forward(self, *args, **kwargs): + if self.disable_adapters or (self.active_adapter not in self.modules_to_save): + return self.original_module(*args, **kwargs) + return self.modules_to_save[self.active_adapter](*args, **kwargs) + + def enable_adapters(self, enabled: bool): + """Toggle the enabling and disabling of adapters + + Takes care of setting the requires_grad flag for the adapter weights. + + Args: + enabled (bool): True to enable adapters, False to disable adapters + """ + if self._disable_adapters is not enabled: + # already in the desired state, do nothing + return + + if enabled: + self.original_module.requires_grad_(False) + self.modules_to_save[self.active_adapter].requires_grad_(True) + self._disable_adapters = False + else: + self.original_module.requires_grad_(True) + self.modules_to_save.requires_grad_(False) + self._disable_adapters = True + + def set_adapter(self, adapter_name: str): + """Set the active adapter + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (str): The name of the adapter to set as active + """ + if adapter_name not in self.modules_to_save: + raise ValueError(f"Adapter {adapter_name} not found in {self.modules_to_save.keys()}") + + self.modules_to_save[self.active_adapter].requires_grad_(False) + self.modules_to_save[adapter_name].requires_grad_(True) + self._active_adapter = adapter_name + + +def _get_submodules(model, key): + parent = model.get_submodule(".".join(key.split(".")[:-1])) + target_name = key.split(".")[-1] + target = model.get_submodule(key) + return parent, target, target_name + + +def _freeze_adapter(model, adapter_name): + for n, p in model.named_parameters(): + if adapter_name in n: + p.requires_grad = False + + +def _set_trainable(model, adapter_name): + key_list = [key for key, _ in model.named_modules()] + for key in key_list: + target_module_found = any(key.endswith(target_key) for target_key in model.modules_to_save) + if target_module_found: + parent, target, target_name = _get_submodules(model, key) + if isinstance(target, ModulesToSaveWrapper): + target.update(adapter_name) + target.set_adapter(target.active_adapter) + else: + new_module = ModulesToSaveWrapper(target, adapter_name) + new_module.set_adapter(adapter_name) + setattr(parent, target_name, new_module) + + +def _set_adapter(model, adapter_name): + def check_adapter_name(adapter_name): + if isinstance(adapter_name, str): + return adapter_name + + # adapter_name is a list of str + if len(adapter_name) > 1: + raise ValueError("Only one adapter can be set at a time for modules_to_save") + elif len(adapter_name) == 0: + raise ValueError("Please specify at least one adapter to set") + adapter_name = adapter_name[0] + return adapter_name + + for module in model.modules(): + if isinstance(module, ModulesToSaveWrapper): + # only check the adapter_name if we actually encounter a 
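`_get_submodules` above resolves a dotted key into (parent, target, attribute name), which is what `_set_trainable` uses to swap a module for its `ModulesToSaveWrapper`. A standalone demonstration of the same resolve-and-swap pattern on a toy model:

``` python
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))

def get_submodules(model, key):
    # mirrors _get_submodules above: resolve parent module, target module, and attribute name
    parent = model.get_submodule(".".join(key.split(".")[:-1]))
    target_name = key.split(".")[-1]
    target = model.get_submodule(key)
    return parent, target, target_name

parent, target, target_name = get_submodules(model, "2")
# swapping the resolved child in place is how wrapper modules get installed
setattr(parent, target_name, nn.Linear(8, 4))
print(model[2])  # Linear(in_features=8, out_features=4, bias=True)
```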
ModulesToSaveWrapper, otherwise we don't care + adapter_name = check_adapter_name(adapter_name) + module.set_adapter(adapter_name) + + +def _prepare_prompt_learning_config(peft_config, model_config): + if peft_config.num_layers is None: + if "num_hidden_layers" in model_config: + num_layers = model_config["num_hidden_layers"] + elif "num_layers" in model_config: + num_layers = model_config["num_layers"] + elif "n_layer" in model_config: + num_layers = model_config["n_layer"] + else: + raise ValueError("Please specify `num_layers` in `peft_config`") + peft_config.num_layers = num_layers + + if peft_config.token_dim is None: + if "hidden_size" in model_config: + token_dim = model_config["hidden_size"] + elif "n_embd" in model_config: + token_dim = model_config["n_embd"] + elif "d_model" in model_config: + token_dim = model_config["d_model"] + else: + raise ValueError("Please specify `token_dim` in `peft_config`") + peft_config.token_dim = token_dim + + if peft_config.num_attention_heads is None: + if "num_attention_heads" in model_config: + num_attention_heads = model_config["num_attention_heads"] + elif "n_head" in model_config: + num_attention_heads = model_config["n_head"] + elif "num_heads" in model_config: + num_attention_heads = model_config["num_heads"] + elif "encoder_attention_heads" in model_config: + num_attention_heads = model_config["encoder_attention_heads"] + else: + raise ValueError("Please specify `num_attention_heads` in `peft_config`") + peft_config.num_attention_heads = num_attention_heads + + if getattr(peft_config, "encoder_hidden_size", None) is None: + setattr(peft_config, "encoder_hidden_size", peft_config.token_dim) + + return peft_config + + +def fsdp_auto_wrap_policy(model): + import functools + import os + + from accelerate import FullyShardedDataParallelPlugin + from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy + + from ..tuners import PrefixEncoder, PromptEmbedding, PromptEncoder + + default_transformer_cls_names_to_wrap = ( + ",".join(model._no_split_modules) if getattr(model, "_no_split_modules", None) is not None else "" + ) + transformer_cls_names_to_wrap = os.environ.get( + "FSDP_TRANSFORMER_CLS_TO_WRAP", default_transformer_cls_names_to_wrap + ).split(",") + transformer_cls_to_wrap = {PrefixEncoder, PromptEncoder, PromptEmbedding} + for layer_class in transformer_cls_names_to_wrap: + transformer_cls = FullyShardedDataParallelPlugin.get_module_class_from_name(model, layer_class) + if transformer_cls is None: + raise Exception("Could not find the transformer layer class to wrap in the model.") + else: + transformer_cls_to_wrap.add(transformer_cls) + + def lambda_policy_fn(module): + if ( + len(list(module.named_children())) == 0 + and getattr(module, "weight", None) is not None + and module.weight.requires_grad + ): + return True + return False + + lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) + transformer_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + transformer_layer_cls=transformer_cls_to_wrap, + ) + + auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy]) + return auto_wrap_policy + + +def transpose(weight, fan_in_fan_out): + if not fan_in_fan_out: + return weight + + if isinstance(weight, torch.nn.Parameter): + return torch.nn.Parameter(weight.T) + return weight.T + + +def _is_valid_match(key: str, target_key: str): + """ + Helper function to match module names target_key and key. 
Makes sure that either the key is exactly the target_key + or the target_key is a submodule of key + """ + if key.endswith(target_key): + if len(key) > len(target_key): + return key.endswith("." + target_key) # must be a sub module + return True + return False + + +def _get_batch_size(input_ids: Optional[torch.Tensor], inputs_embeds: Optional[torch.Tensor]) -> int: + """Get the batch size based on either input_ids or input_embeds + + Raises an ValueError if both are None. + + """ + if (input_ids is None) and (inputs_embeds is None): + raise ValueError("You have to provide either input_ids or inputs_embeds") + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + return batch_size + + +def get_quantization_config(model: torch.nn.Module, method: str): + """ + Get the quantization config of the related quantization method + """ + if ( + hasattr(model, "config") + and hasattr(model.config, "quantization_config") + and (getattr(model, "quantization_method", None) == method) + ): + return model.config.quantization_config + return None + + +def get_auto_gptq_quant_linear(gptq_quantization_config): + """ + Get the right AutoGPTQQuantLinear class based on the quantization config file + """ + if gptq_quantization_config is not None and is_auto_gptq_available(): + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + + desc_act = gptq_quantization_config.desc_act + group_size = gptq_quantization_config.group_size + bits = gptq_quantization_config.bits + if hasattr(gptq_quantization_config, "use_exllama"): + use_exllama = gptq_quantization_config.use_exllama + else: + use_exllama = not gptq_quantization_config.disable_exllama + if hasattr(gptq_quantization_config, "exllama_config"): + exllama_version = gptq_quantization_config.exllama_config["version"] + else: + exllama_version = 1 + AutoGPTQQuantLinear = dynamically_import_QuantLinear( + use_triton=False, + desc_act=desc_act, + group_size=group_size, + bits=bits, + disable_exllama=not (use_exllama and exllama_version == 1), + disable_exllamav2=not (use_exllama and exllama_version == 2), + ) + return AutoGPTQQuantLinear + return None + + +def id_tensor_storage(tensor: torch.Tensor) -> Tuple[torch.device, int, int]: + """ + Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For + example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is + guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with + non-overlapping lifetimes may have the same id. + + This method is the exact same copy of + https://github.com/huggingface/transformers/blob/main/src/transformers/pytorch_utils.py#L282C1-L300C58 but we added + it here manually to avoid import issue with old versions of transformers. + """ + if tensor.device.type == "xla" and is_torch_tpu_available(): + # NOTE: xla tensors dont have storage + # use some other unique id to distinguish. + # this is a XLA tensor, it must be created using torch_xla's + # device. So the following import is safe: + import torch_xla + + unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) + else: + unique_id = storage_ptr(tensor) + + return tensor.device, unique_id, storage_size(tensor) + + +def cast_mixed_precision_params(model, dtype): + """ + Cast all non-trainable parameters of the model to the given `dtype`. 
The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. The trainable parameters are cast to full + precision. This is meant to reduce the GPU memory usage when using PEFT methods by using half-precision dtype for + non-trainable parameters. Having the trainable parameters in full-precision preserves training stability when using + automatic mixed-precision training. + + Args: + model (`torch.nn.Module`): + The model to cast the non-trainable parameters of. + dtype (`torch.dtype`): + The dtype to cast the non-trainable parameters to. The `dtype` can be `torch.float16` or + `torch.bfloat16` as per the mixed-precision training you are performing. + """ + for p in model.parameters(): + if not p.requires_grad: + p.data = p.to(dtype) + else: + p.data = p.to(torch.float32) + + +def str_to_bool(value: str) -> int: + """ + Converts a string representation of truth to `True` (1) or `False` (0). + + True values are `y`, `yes`, `t`, `true`, `on`, and `1`; False value are `n`, `no`, `f`, `false`, `off`, and `0`; + """ + # same as function as in accelerate.utils, which replaces the deprecated distutils.util.strtobool + value = value.lower() + if value in ("y", "yes", "t", "true", "on", "1"): + return 1 + elif value in ("n", "no", "f", "false", "off", "0"): + return 0 + else: + raise ValueError(f"invalid truth value {value}") + + +def check_file_exists_on_hf_hub(repo_id: str, filename: str, **kwargs) -> Optional[bool]: + """Check if a file exists on HF Hub, if check was not successful returns None instead of erroring. + + Respect offline mode if set. + + """ + exists: Optional[bool] = None + if str_to_bool(os.environ.get("HF_HUB_OFFLINE", "0")): + # user set offline mode, cannot check + return exists + + try: + exists = file_exists(repo_id, filename, **kwargs) + except (HFValidationError, EntryNotFoundError): + # error, exists stays None + pass + except Exception as e: + warnings.warn( + f"Unable to fetch remote file due to the following error {e} - silently ignoring the lookup" + f" for the file {filename} in {repo_id}." + ) + + return exists diff --git a/MoRA/peft_mora/utils/peft_types.py b/MoRA/peft_mora/utils/peft_types.py new file mode 100644 index 0000000000000000000000000000000000000000..d4a84435dc60bae6fde33bdbbc5e24f03293c678 --- /dev/null +++ b/MoRA/peft_mora/utils/peft_types.py @@ -0,0 +1,73 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all + +# coding=utf-8 +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import enum + + +class PeftType(str, enum.Enum): + """ + Enum class for the different types of adapters in PEFT. 
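`cast_mixed_precision_params` above keeps trainable parameters in fp32 and moves frozen ones to the half-precision dtype. A toy demonstration, treating the first layer as the frozen base and the second as the trainable adapter:

``` python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))
model[0].requires_grad_(False)   # pretend layer 0 is the frozen base model

for p in model.parameters():
    # frozen parameters go to half precision, trainable ones stay in fp32
    p.data = p.to(torch.bfloat16) if not p.requires_grad else p.to(torch.float32)

print({name: p.dtype for name, p in model.named_parameters()})
# {'0.weight': torch.bfloat16, '0.bias': torch.bfloat16,
#  '1.weight': torch.float32, '1.bias': torch.float32}
```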
+ + Supported PEFT types: + - PROMPT_TUNING + - MULTITASK_PROMPT_TUNING + - P_TUNING + - PREFIX_TUNING + - LORA + - ADALORA + - ADAPTION_PROMPT + - IA3 + - LOHA + - LOKR + - OFT + """ + + PROMPT_TUNING = "PROMPT_TUNING" + MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING" + P_TUNING = "P_TUNING" + PREFIX_TUNING = "PREFIX_TUNING" + LORA = "LORA" + ADALORA = "ADALORA" + ADAPTION_PROMPT = "ADAPTION_PROMPT" + IA3 = "IA3" + LOHA = "LOHA" + LOKR = "LOKR" + OFT = "OFT" + POLY = "POLY" + + +class TaskType(str, enum.Enum): + """ + Enum class for the different types of tasks supported by PEFT. + + Overview of the supported task types: + - SEQ_CLS: Text classification. + - SEQ_2_SEQ_LM: Sequence-to-sequence language modeling. + - CAUSAL_LM: Causal language modeling. + - TOKEN_CLS: Token classification. + - QUESTION_ANS: Question answering. + - FEATURE_EXTRACTION: Feature extraction. Provides the hidden states which can be used as embeddings or features + for downstream tasks. + """ + + SEQ_CLS = "SEQ_CLS" + SEQ_2_SEQ_LM = "SEQ_2_SEQ_LM" + CAUSAL_LM = "CAUSAL_LM" + TOKEN_CLS = "TOKEN_CLS" + QUESTION_ANS = "QUESTION_ANS" + FEATURE_EXTRACTION = "FEATURE_EXTRACTION" diff --git a/MoRA/peft_mora/utils/save_and_load.py b/MoRA/peft_mora/utils/save_and_load.py new file mode 100644 index 0000000000000000000000000000000000000000..5ac1264d89afb82d3ab3eadccb49163a75284aaa --- /dev/null +++ b/MoRA/peft_mora/utils/save_and_load.py @@ -0,0 +1,330 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import warnings +from typing import Optional + +import torch +from huggingface_hub import file_exists, hf_hub_download +from huggingface_hub.utils import EntryNotFoundError +from safetensors.torch import load_file as safe_load_file + +from .other import ( + EMBEDDING_LAYER_NAMES, + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, + check_file_exists_on_hf_hub, + infer_device, +) +from .peft_types import PeftType + + +def has_valid_embedding_base_layer(layer): + """Check if the layer has an embedding base layer""" + return hasattr(layer, "base_layer") and isinstance(layer.base_layer, (torch.nn.Linear, torch.nn.Embedding)) + + +def get_embedding_layer_name(model, layer, is_embedding_in_target_modules): + """Get the name of the embedding module for a given layer.""" + for name, module in model.named_modules(): + if (not is_embedding_in_target_modules and module == layer) or module == getattr(layer, "base_layer", None): + return name + return None + + +def get_peft_model_state_dict( + model, state_dict=None, adapter_name="default", unwrap_compiled=False, save_embedding_layers="auto" +): + """ + Get the state dict of the Peft model. + + Args: + model ([`PeftModel`]): The Peft model. When using torch.nn.DistributedDataParallel, DeepSpeed or FSDP, + the model should be the underlying model/unwrapped model (i.e. model.module). + state_dict (`dict`, *optional*, defaults to `None`): + The state dict of the model. If not provided, the state dict of the passed model will be used. 
+ adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter whose state dict should be returned. + unwrap_compiled (`bool`, *optional*, defaults to `False`): + Whether to unwrap the model if torch.compile was used. + save_embedding_layers (`Union[bool, str]`, , *optional*, defaults to `auto`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common embedding + layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. Based on it + sets the boolean flag. This only works for πŸ€— transformers models. + """ + if unwrap_compiled: + model = getattr(model, "_orig_mod", model) + + config = model.peft_config[adapter_name] + if state_dict is None: + state_dict = model.state_dict() + if config.peft_type in (PeftType.LORA, PeftType.ADALORA): + # to_return = lora_state_dict(model, bias=model.peft_config.bias) + # adapted from `https://github.com/microsoft/LoRA/blob/main/loralib/utils.py` + # to be used directly with the state dict which is necessary when using DeepSpeed or FSDP + bias = config.bias + if bias == "none": + to_return = {k: state_dict[k] for k in state_dict if "lora_" in k} + elif bias == "all": + to_return = {k: state_dict[k] for k in state_dict if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + for k in state_dict: + if "lora_" in k: + to_return[k] = state_dict[k] + bias_name = k.split("lora_")[0] + "bias" + if bias_name in state_dict: + to_return[bias_name] = state_dict[bias_name] + else: + raise NotImplementedError + to_return = {k: v for k, v in to_return.items() if (("lora_" in k and adapter_name in k) or ("bias" in k))} + if config.peft_type == PeftType.ADALORA: + rank_pattern = config.rank_pattern + if rank_pattern is not None: + rank_pattern = {k.replace(f".{adapter_name}", ""): v for k, v in rank_pattern.items()} + config.rank_pattern = rank_pattern + to_return = model.resize_state_dict_by_rank_pattern(rank_pattern, to_return, adapter_name) + + elif config.peft_type == PeftType.LOHA: + to_return = {k: state_dict[k] for k in state_dict if "hada_" in k} + + elif config.peft_type == PeftType.LOKR: + to_return = {k: state_dict[k] for k in state_dict if "lokr_" in k} + + elif config.peft_type == PeftType.ADAPTION_PROMPT: + to_return = {k: state_dict[k] for k in state_dict if k.split(".")[-1].startswith("adaption_")} + elif config.is_prompt_learning: + to_return = {} + if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + to_return["prefix_task_cols"] = model.prompt_encoder[adapter_name].prefix_task_cols + to_return["prefix_task_rows"] = model.prompt_encoder[adapter_name].prefix_task_rows + prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight + else: + if config.inference_mode: + prompt_embeddings = model.prompt_encoder[adapter_name].embedding.weight + else: + prompt_embeddings = model.get_prompt_embedding_to_save(adapter_name) + to_return["prompt_embeddings"] = prompt_embeddings + elif config.peft_type == PeftType.IA3: + to_return = {k: state_dict[k] for k in state_dict if "ia3_" in k} + elif config.peft_type == PeftType.OFT: + to_return = {k: state_dict[k] for k in state_dict if "oft_" in k} + elif config.peft_type == PeftType.POLY: + to_return = {k: state_dict[k] for k in state_dict if "poly_" in k} + else: + raise NotImplementedError + if getattr(model, "modules_to_save", None) is not None: + for key, value in state_dict.items(): + if any(f"{module_name}.modules_to_save.{adapter_name}" in key for module_name in 
model.modules_to_save): + to_return[key.replace("modules_to_save.", "")] = value + + # check the common embedding layers in `target_modules` to reset `save_embedding_layers` if necessary + is_embedding_in_target_modules = False + if ( + save_embedding_layers == "auto" + and hasattr(config, "target_modules") + and any(k in config.target_modules for k in EMBEDDING_LAYER_NAMES) + ): + warnings.warn("Setting `save_embedding_layers` to `True` as embedding layers found in `target_modules`.") + save_embedding_layers = is_embedding_in_target_modules = True + elif save_embedding_layers == "auto": + vocab_size = getattr(getattr(model, "config", None), "vocab_size", None) + model_id = getattr(config, "base_model_name_or_path", None) + + # For some models e.g. diffusers the text config file is stored in a subfolder + # we need to make sure we can download that config. + has_remote_config = False + + # ensure that this check is not performed in HF offline mode, see #1452 + if model_id is not None: + exists = check_file_exists_on_hf_hub(model_id, "config.json") + if exists is None: + # check failed, could not determine if it exists or not + warnings.warn( + f"Could not find a config file in {model_id} - will assume that the vocabulary was not modified." + ) + has_remote_config = False + else: + has_remote_config = exists + + # check if the vocab size of the base model is different from the vocab size of the finetuned model + if ( + vocab_size + and model_id + and has_remote_config + and (vocab_size != model.config.__class__.from_pretrained(model_id).vocab_size) + ): + warnings.warn( + "Setting `save_embedding_layers` to `True` as the embedding layer has been resized during finetuning." + ) + save_embedding_layers = True + else: + save_embedding_layers = False + + if save_embedding_layers and hasattr(model, "get_input_embeddings"): + for layer in [model.get_input_embeddings(), model.get_output_embeddings()]: + if not is_embedding_in_target_modules or has_valid_embedding_base_layer(layer): + # support from version >= 0.6.2 + embedding_module_name = get_embedding_layer_name(model, layer, is_embedding_in_target_modules) + if embedding_module_name: + to_return.update({k: v for k, v in state_dict.items() if embedding_module_name in k}) + elif save_embedding_layers: + warnings.warn("Could not identify embedding layer(s) because the model is not a πŸ€— transformers model.") + + to_return = {k.replace(f".{adapter_name}", ""): v for k, v in to_return.items()} + return to_return + + +def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default"): + """ + Set the state dict of the Peft model. + + Args: + model ([`PeftModel`]): The Peft model. + peft_model_state_dict (`dict`): The state dict of the Peft model. 
+ """ + config = model.peft_config[adapter_name] + state_dict = {} + if getattr(model, "modules_to_save", None) is not None: + for key, value in peft_model_state_dict.items(): + if any(module_name in key for module_name in model.modules_to_save): + for module_name in model.modules_to_save: + if module_name in key: + key = key.replace(module_name, f"{module_name}.modules_to_save.{adapter_name}") + break + state_dict[key] = value + else: + state_dict = peft_model_state_dict + + if config.peft_type in ( + PeftType.LORA, + PeftType.LOHA, + PeftType.LOKR, + PeftType.ADALORA, + PeftType.IA3, + PeftType.OFT, + PeftType.POLY, + ): + peft_model_state_dict = {} + parameter_prefix = { + PeftType.IA3: "ia3_", + PeftType.LORA: "lora_", + PeftType.ADALORA: "lora_", + PeftType.LOHA: "hada_", + PeftType.LOKR: "lokr_", + PeftType.OFT: "oft_", + PeftType.POLY: "poly_", + }[config.peft_type] + for k, v in state_dict.items(): + if parameter_prefix in k: + suffix = k.split(parameter_prefix)[1] + if "." in suffix: + suffix_to_replace = ".".join(suffix.split(".")[1:]) + k = k.replace(suffix_to_replace, f"{adapter_name}.{suffix_to_replace}") + else: + k = f"{k}.{adapter_name}" + peft_model_state_dict[k] = v + else: + peft_model_state_dict[k] = v + if config.peft_type == PeftType.ADALORA: + rank_pattern = config.rank_pattern + if rank_pattern is not None: + model.resize_modules_by_rank_pattern(rank_pattern, adapter_name) + elif config.is_prompt_learning or config.peft_type == PeftType.ADAPTION_PROMPT: + peft_model_state_dict = state_dict + else: + raise NotImplementedError + + load_result = model.load_state_dict(peft_model_state_dict, strict=False) + if config.is_prompt_learning: + model.prompt_encoder[adapter_name].embedding.load_state_dict( + {"weight": peft_model_state_dict["prompt_embeddings"]}, strict=True + ) + + if config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + model.prompt_encoder[adapter_name].load_state_dict(peft_model_state_dict, strict=False) + return load_result + + +def load_peft_weights(model_id: str, device: Optional[str] = None, **hf_hub_download_kwargs) -> dict: + r""" + A helper method to load the PEFT weights from the HuggingFace Hub or locally + + Args: + model_id (`str`): + The local path to the adapter weights or the name of the adapter to load from the HuggingFace Hub. + device (`str`): + The device to load the weights onto. + hf_hub_download_kwargs (`dict`): + Additional arguments to pass to the `hf_hub_download` method when loading from the HuggingFace Hub. 
+ """ + path = ( + os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) + if hf_hub_download_kwargs.get("subfolder", None) is not None + else model_id + ) + + if device is None: + device = infer_device() + + if os.path.exists(os.path.join(path, SAFETENSORS_WEIGHTS_NAME)): + filename = os.path.join(path, SAFETENSORS_WEIGHTS_NAME) + use_safetensors = True + elif os.path.exists(os.path.join(path, WEIGHTS_NAME)): + filename = os.path.join(path, WEIGHTS_NAME) + use_safetensors = False + else: + token = hf_hub_download_kwargs.get("token", None) + if token is None: + token = hf_hub_download_kwargs.get("use_auth_token", None) + + hub_filename = ( + os.path.join(hf_hub_download_kwargs["subfolder"], SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder", None) is not None + else SAFETENSORS_WEIGHTS_NAME + ) + has_remote_safetensors_file = file_exists( + repo_id=model_id, + filename=hub_filename, + revision=hf_hub_download_kwargs.get("revision", None), + repo_type=hf_hub_download_kwargs.get("repo_type", None), + token=token, + ) + use_safetensors = has_remote_safetensors_file + + if has_remote_safetensors_file: + # Priority 1: load safetensors weights + filename = hf_hub_download( + model_id, + SAFETENSORS_WEIGHTS_NAME, + **hf_hub_download_kwargs, + ) + else: + try: + filename = hf_hub_download(model_id, WEIGHTS_NAME, **hf_hub_download_kwargs) + except EntryNotFoundError: + raise ValueError( + f"Can't find weights for {model_id} in {model_id} or in the Hugging Face Hub. " + f"Please check that the file {WEIGHTS_NAME} or {SAFETENSORS_WEIGHTS_NAME} is present at {model_id}." + ) + + if use_safetensors: + if hasattr(torch.backends, "mps") and (device == torch.device("mps")): + adapters_weights = safe_load_file(filename, device="cpu") + else: + adapters_weights = safe_load_file(filename, device=device) + else: + adapters_weights = torch.load(filename, map_location=torch.device(device)) + + return adapters_weights diff --git a/MoRA/requirements.txt b/MoRA/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..efecf06be3ec2d075848d78c76ca12e1ff99f8d4 --- /dev/null +++ b/MoRA/requirements.txt @@ -0,0 +1,7 @@ +transformers==4.37.2 +bitsandbytes==0.39.0 +deepspeed==0.11.1 +accelerate @ git+https://github.com/huggingface/accelerate.git@bd72a5f1a80d5146554458823f8aeda0a9db5297 +fire==0.6.0 +tqdm==4.62.3 +torch==2.1.1+cu121 diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6ced7caa793eae17b5794e39e60064a550a05aab --- /dev/null +++ b/README.md @@ -0,0 +1,93 @@ +--- +base_model: Gunulhona/Gemma-System-9B +library_name: peft +--- + +# Gemma-System-9B with MoRA + SimPO + + +This is a SimPO finetuned version of Gemma-System-9B using MoRA (Mixture of Rank Adaptation) for preference alignment. The model is trained to better align with human preferences through direct preference optimization. + +## Model Details + +### Model Description + + +This model is a finetuned version of Gemma-System-9B using SimPO (Simple Preference Optimization) training method. The model uses MoRA adaptation with rank 256 to efficiently finetune the base model while maintaining its core capabilities. 
+ +- **Developed by:** Gunulhona (finetuned from a merged Gemma-2-9B-it base) +- **Model type:** Causal Language Model with MoRA adaptation +- **Language(s):** Primarily English and Korean +- **License:** Same as base model (Gemma-System-9B) +- **Finetuned from model:** Gunulhona/Gemma-System-9B + +## Training Details + +### Training Procedure + +#### Training Hyperparameters + +- **Training regime:** bfloat16 mixed precision +- **Learning rate:** 5e-7 +- **Batch size per device:** 1 +- **Gradient accumulation steps:** 16 +- **Total batch size:** 16 +- **Number of epochs:** 200 +- **Optimizer:** AdamW with a cosine-with-restarts scheduler +- **Loss type:** SimPO (configurable) +- **Beta (SimPO):** 10.0 +- **SimPO gamma:** 0.5 +- **Maximum sequence length:** 65,536 tokens + +#### MoRA Configuration +- **Rank (r):** 256 +- **Alpha:** 16 +- **Dropout:** 0.05 +- **MoRA type:** 6 +- **Target modules:** + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - down_proj + - up_proj + +### Training Data + +The model was trained on the "Gunulhona/open_dpo_merged" dataset, which contains pairs of preferred and non-preferred responses for preference learning. + +## Technical Specifications + +### Model Architecture and Objective + +The model uses MoRA to apply high-rank, parameter-efficient updates to the base model. Training supports either a DPO or a SimPO objective; this checkpoint was trained with: + +- **SimPO:** Simple Preference Optimization with β=10.0 and γ=0.5 + +### Compute Infrastructure + +#### Hardware +- Training performed on CUDA-capable GPUs +- Uses DeepSpeed for distributed training +- Gradient checkpointing enabled for memory efficiency + +#### Software +- PEFT library for parameter-efficient finetuning +- Transformers library +- DeepSpeed for training optimization +- Weights & Biases for experiment tracking + +## Environmental Impact + +- **Hardware Type:** NVIDIA GPUs +- **Training Regime:** Mixed BF16 precision +- **Optimization:** DeepSpeed + Gradient Checkpointing + +## Model Card Contact + +For questions about this model, please contact Gunulhona.
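The β=10.0 and γ=0.5 values above correspond to the standard SimPO objective, which scores each response by its length-normalized log-likelihood and asks the preferred response to beat the rejected one by a margin γ. As a clarifying sketch (stated from the SimPO formulation, not taken from this repo's training code), the per-pair loss is:

$$\mathcal{L}_{\mathrm{SimPO}}(x, y_w, y_l) = -\log\,\sigma\!\Big(\tfrac{\beta}{|y_w|}\log \pi_\theta(y_w \mid x) - \tfrac{\beta}{|y_l|}\log \pi_\theta(y_l \mid x) - \gamma\Big)$$

For inference, the following is a minimal, hypothetical loading sketch rather than the authors' script. It assumes the repo's bundled PEFT fork (`peft_mora`, based on PEFT 0.9.0) is installed, since stock PEFT does not recognize the `use_mora`/`mora_type` fields in `adapter_config.json`; `ADAPTER_PATH` is a placeholder for a local checkout of this repository.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: the repo's PEFT fork (peft_mora) is installed and exposes the usual PeftModel API.
from peft_mora import PeftModel

BASE_ID = "Gunulhona/Gemma-System-9B"   # base model named in adapter_config.json
ADAPTER_PATH = "."                      # hypothetical: local checkout of this adapter repo

tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH)  # tokenizer files ship with the adapter
model = AutoModelForCausalLM.from_pretrained(
    BASE_ID, torch_dtype=torch.bfloat16, device_map="auto"
)

# Attach the MoRA adapter (r=256, mora_type=6) stored in adapter_model.safetensors.
model = PeftModel.from_pretrained(model, ADAPTER_PATH)

# Optionally fold the high-rank update back into the base weights, as with LoRA.
model = model.merge_and_unload()
```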
+ +### Framework versions + +- [PEFT 0.9.0](https://github.com/kongds/MoRA) diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..de716385a591ce0998c73380ef6942cb51953a99 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,39 @@ +{ + "alpha_pattern": {}, + "auto_mapping": { + "AutoConfig": "MoRA.config.MoRAConfig", + "AutoModel": "MoRA.model.MoRAModel", + "AutoModelForCausalLM": "MoRA.model.MoRAModelForCausalLM" + }, + "base_model_name_or_path": "Gunulhona/Gemma-System-9B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "mora_type": 6, + "peft_type": "LORA", + "r": 256, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "k_proj", + "up_proj", + "o_proj", + "gate_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_mora": true, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9ba1e99ebec7e911cd328a3fe8805d840ab1546f --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62f88bb8a58b8585a6eb9bc6ee5b1e31ae56c4cb87266681b6049daa50e13545 +size 6913516904 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..8d6368f7e735fbe4781bf6e956b7c6ad0586df80 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,34 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..f58963a682665634ab180c28667e4faa8cf02ba2 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f559f2189f392b4555613965f089e7c4d300b41fbe080bf79da0d676e33ee7f0 +size 34356041 diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..45db2b9123c19dd338c31a1403c77d3acb98b846 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,1757 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": 
false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": 
false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{% if 
message['role'] not in ['user', 'assistant', 'system'] or (loop.index0 > 0 and message['role'] != 'system' and message['role'] == messages[loop.index0 - 1]['role']) %}{{ raise_exception('Invalid role or role sequence') }}{% endif %}{{ '<start_of_turn>' + message['role'] + '\n' + message['content'] | trim + '<end_of_turn>\n' if message['role'] != 'system' else message['content'] + '\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<start_of_turn>model\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<eos>", + "model_max_length": 2048, + "pad_token": "<pad>", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false +}
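The `chat_template` in `tokenizer_config.json` implements the Gemma turn format, extended so that a leading system message is passed through verbatim instead of being wrapped in turn markers. As an illustration (not part of the repository files), a conversation could be rendered with `transformers` like this; the local path is a hypothetical placeholder for a checkout containing the tokenizer files added above:

```python
from transformers import AutoTokenizer

# Hypothetical path: a local checkout containing tokenizer_config.json / tokenizer.model.
tokenizer = AutoTokenizer.from_pretrained(".")

messages = [
    {"role": "system", "content": "You are a concise, helpful assistant."},
    {"role": "user", "content": "Summarize what a MoRA adapter is in one sentence."},
]

# Produces <bos>, the system text verbatim, the user turn wrapped in
# <start_of_turn>user ... <end_of_turn>, and a trailing <start_of_turn>model line
# because add_generation_prompt=True; invalid or repeated consecutive roles raise an exception.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```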