本文介绍如何通过微调 Meta 的 Llama 2 7B 模型,对 18 个不同类别的新闻文章进行分类。本教程将详细解释每个步骤,涵盖所用到的所有类、函数和参数。
安装所需库
!pip install -q accelerate==0.21.0 --progress-bar off
!pip install -q peft==0.4.0 --progress-bar off
!pip install -q bitsandbytes==0.40.2 --progress-bar off
!pip install -q transformers==4.31.0 --progress-bar off
!pip install -q trl==0.4.7 --progress-bar off
加载所需库
import os
from random import randrange
from functools import partial
import torch
from datasets import load_dataset
from transformers import (AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
Trainer,
TrainingArguments,
DataCollatorForLanguageModeling,
EarlyStoppingCallback,
pipeline,
logging,
set_seed)
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer
Bitsandbytes 配置
定义一个函数 create_bnb_config 来创建 bitsandbytes 配置。模型量化是一种通过减少表示权重和激活所用的位数来压缩深度学习模型的技术,它可以加快推理并减少内存消耗,从而使这些模型能够部署在资源有限的边缘设备上。
load_in_4bit
:以 4 位精度加载模型,即将内存使用量除以 4。
bnb_4bit_use_double_quant
:使用嵌套量化技术进行更节省内存的推理,且没有额外的性能损失。
bnb_4bit_quant_type
:设置量化数据类型。可选值为 FP4(4 位浮点),即默认的量化数据类型;或 NF4(4 位正态浮点,Normal Float 4),适用于按正态分布初始化的权重。
bnb_4bit_compute_dtype
:设置 4 位模型的计算数据类型。默认值:torch.float32
def create_bnb_config(load_in_4bit, bnb_4bit_use_double_quant, bnb_4bit_quant_type, bnb_4bit_compute_dtype):
    """
    Build the bitsandbytes quantization settings used when loading the model.

    :param load_in_4bit: Whether to load the model in 4-bit precision
    :param bnb_4bit_use_double_quant: Whether to use nested (double) quantization
    :param bnb_4bit_quant_type: Quantization data type for the 4-bit model
    :param bnb_4bit_compute_dtype: Computation data type for the 4-bit model
    :return: BitsAndBytesConfig built from the four settings above
    """
    # Collect the options by name first so the mapping to the config fields is explicit
    quantization_options = {
        "load_in_4bit": load_in_4bit,
        "bnb_4bit_use_double_quant": bnb_4bit_use_double_quant,
        "bnb_4bit_quant_type": bnb_4bit_quant_type,
        "bnb_4bit_compute_dtype": bnb_4bit_compute_dtype,
    }
    return BitsAndBytesConfig(**quantization_options)
加载 Model and Tokenizer
获取可用的 GPU 数量,并为每块 GPU 设置最大可用内存。使用 device_map 将模型发送到指定设备;设置 device_map = "auto" 时,将由 accelerate 自动分配。
def load_model(model_name, bnb_config, max_memory_mb=40960):
    """
    Loads model and model tokenizer

    :param model_name: Hugging Face model name
    :param bnb_config: Bitsandbytes configuration
    :param max_memory_mb: Memory budget per GPU, in megabytes (defaults to the
        previously hard-coded 40960)
    :return: Tuple ``(model, tokenizer)``; the tokenizer's padding token is set
        to its EOS token
    """
    # Give every visible GPU the same memory budget
    n_gpus = torch.cuda.device_count()
    if n_gpus:
        max_memory = {gpu_index: f"{max_memory_mb}MB" for gpu_index in range(n_gpus)}
    else:
        # With no visible GPU the per-device dict would be empty; pass None
        # (the from_pretrained default) so placement is left to the library
        max_memory = None

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # dispatch the model efficiently on the available resources
        max_memory=max_memory,
    )

    # Load model tokenizer with the user authentication token
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Set padding token as EOS token
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer
初始化参数
################################################################################
# transformers parameters
################################################################################
# Hugging Face Hub identifier of the pre-trained model to load and fine-tune
model_name = "meta-llama/Llama-2-7b-hf"

################################################################################
# bitsandbytes parameters
################################################################################
load_in_4bit = True                      # load the base model in 4-bit precision
bnb_4bit_use_double_quant = True         # nested (double) quantization for the 4-bit base model
bnb_4bit_quant_type = "nf4"              # quantization type: fp4 or nf4
bnb_4bit_compute_dtype = torch.bfloat16  # compute data type for the 4-bit base model

# Build the quantization configuration, then load the model and tokenizer with it
bnb_config = create_bnb_config(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
)
model, tokenizer = load_model(model_name=model_name, bnb_config=bnb_config)
加载数据集
# Path of the CSV file that holds the instruction dataset
dataset_name = "文件路径/news_classification.csv"

# Read the file with the "csv" loader and request its "train" split
dataset = load_dataset("csv", split="train", data_files=dataset_name)

# Report the dataset size and its column names
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')
创建提示模板
def create_prompt_formats(sample):
    """
    Build the full training prompt for one sample and store it under "text".

    The prompt is made of an intro blurb, the instruction, the optional input,
    the response and an end marker, separated by blank lines.

    :param sample: Mapping with "instruction", "input" and "output" entries;
        it is modified in place
    :return: The same mapping, with the formatted prompt added under "text"
    """
    sections = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
        f"### Instruction:\n{sample['instruction']}",
    ]

    # The input section is only emitted when the sample carries a non-empty input
    if sample["input"]:
        sections.append(f"Input:\n{sample['input']}")

    sections.append(f"### Response:\n{sample['output']}")
    sections.append("### End")

    # A blank line separates consecutive sections
    sample["text"] = "\n\n".join(sections)
    return sample
# Show the formatted prompt of one randomly chosen sample
random_index = randrange(len(dataset))
print(create_prompt_formats(dataset[random_index]))
获取预训练模型的最大长度
定义函数 get_max_length 以找出 Llama-2-7B 模型的最大长度。此函数将拉取模型配置,并尝试从可能包含最大长度的几个配置键之一中找到它。如果未找到最大长度,则默认为 1024。我们将在数据集预处理期间使用该最大长度来删除超过上下文长度的记录。
def get_max_length(model):
    """
    Extracts maximum token length from the model configuration

    The configuration attributes "n_positions", "max_position_embeddings" and
    "seq_length" are checked in that order; the first truthy value wins. When
    none of them is set, 1024 is used. The chosen value is also printed.

    :param model: Hugging Face model (only its ``config`` attribute is read)
    :return: Maximum sequence length
    """
    # Pull model configuration
    conf = model.config
    max_length = None

    # Use the first length-related setting that is present and non-zero
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break

    # Fall back to 1024 when the configuration exposes no usable length
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")

    return max_length
对数据集进行批量分词(Tokenizing)
def preprocess_batch(batch, tokenizer, max_length):
    """
    Run the tokenizer over the "text" entries of one dataset batch.

    :param batch: Dataset batch; its "text" entry is what gets tokenized
    :param tokenizer: Model tokenizer, called with truncation enabled
    :param max_length: Maximum number of tokens to emit from the tokenizer
    :return: Whatever the tokenizer returns for the batch
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: "datasets.Dataset"):
    """
    Tokenizes dataset for fine-tuning

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Random seed for reproducibility
    :param dataset: Instruction dataset object (must provide ``map``,
        ``filter`` and ``shuffle``) with "instruction", "input" and "output"
        columns
    :return: Tokenized, length-filtered and shuffled dataset
    """
    # Add the formatted prompt to each sample under the "text" column
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch by batch and drop the raw text columns
    # ("instruction", "input", "output" and "text")
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "input", "output", "text"],
    )

    # Keep only samples strictly shorter than "max_length"; since tokenization
    # truncates at "max_length", samples that reached the limit are dropped
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset
# Fixed seed so the dataset shuffle is reproducible
seed = 33

# Maximum sequence length reported by the loaded model's configuration
max_length = get_max_length(model)

# Format, tokenize, filter and shuffle the instruction dataset
preprocessed_dataset = preprocess_dataset(
    tokenizer=tokenizer,
    max_length=max_length,
    seed=seed,
    dataset=dataset,
)

# Inspect the result: dataset summary followed by the first sample
print(preprocessed_dataset)
print(preprocessed_dataset[0])
创建 PEFT 配置
PEFT 库提供最新的参数高效微调技术。QLoRA 使用 4 位量化来压缩预训练的语言模型,然后冻结 LM 参数,并以低秩适配器的形式将相对较少的可训练参数添加到模型中。在微调过程中,QLoRA 通过冻结的 4 位量化预训练语言模型将梯度反向传播到低秩适配器中,LoRA 层是在训练期间更新的唯一参数。
def create_peft_config(r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Build the LoRA (Parameter-Efficient Fine-Tuning) configuration for the model.

    :param r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param target_modules: Names of the modules to apply LoRA to
    :param lora_dropout: Dropout probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: Task type forwarded to the LoRA configuration
    :return: LoraConfig built from the arguments above
    """
    return LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=lora_dropout,
        bias=bias,
        task_type=task_type,
    )
要应用 LoRA 的模块
def find_all_linear_names(model):
    """
    Collect the names of the modules LoRA should be applied to.

    Every submodule that is a bitsandbytes 4-bit linear layer contributes the
    last component of its dotted name; "lm_head" is excluded. The resulting
    list is printed and returned.

    :param model: Model whose ``named_modules()`` are inspected
    :return: List of unique module names
    """
    linear_4bit = bnb.nn.Linear4bit

    # Only the last component of the dotted module path is kept
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, linear_4bit)
    }

    # The output head is never a LoRA target
    leaf_names.discard('lm_head')

    module_names = list(leaf_names)
    print(f"LoRA module names: {module_names}")
    return module_names
计算可训练参数
定义 print_trainable_parameters 函数来找出可训练模型参数的数量和百分比。此函数将先统计 model.named_parameters() 中的总参数数,然后统计将要更新的参数数。
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: PEFT model (anything exposing ``named_parameters()``)
    :param use_4bit: When True, the trainable-parameter count is halved before
        it is reported.
        NOTE(review): the reason for halving is not evident from this file.
    :return: None; a summary line with the total count, the trainable count
        and the trainable percentage is printed
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # Parameters reporting zero elements may carry their size in
        # "ds_numel" (presumably DeepSpeed-partitioned weights - confirm)
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Integer division keeps the count an int; true division would yield
        # a float, which the ",d" format below rejects with ValueError
        trainable_params //= 2

    # A model without parameters reports 0% instead of dividing by zero
    trainable_percent = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"All Parameters: {all_param:,d} || Trainable Parameters: {trainable_params:,d} || Trainable Parameters %: {trainable_percent}"
    )
微调预训练模型
启用梯度检查点(gradient checkpointing)以减少微调期间的内存使用量,并使用 PEFT 的 prepare_model_for_kbit_training 函数准备模型。调用 find_all_linear_names 以获取要应用 LoRA 的模块名称,用 create_peft_config 函数创建 LoRA 配置,再使用 get_peft_model 函数将模型包装为 PEFT 模型。
def fine_tune(model,
              tokenizer,
              dataset,
              lora_r,
              lora_alpha,
              lora_dropout,
              bias,
              task_type,
              per_device_train_batch_size,
              gradient_accumulation_steps,
              warmup_steps,
              max_steps,
              learning_rate,
              fp16,
              logging_steps,
              output_dir,
              optim):
    """
    Wrap the pre-trained model with LoRA adapters, train it and save the result.

    :param model: Pre-trained Hugging Face model
    :param tokenizer: Model tokenizer (used by the language-modeling data collator)
    :param dataset: Preprocessed training dataset
    :param lora_r: LoRA attention dimension
    :param lora_alpha: Alpha parameter for LoRA scaling
    :param lora_dropout: Dropout probability for LoRA layers
    :param bias: Specifies if the bias parameters should be trained
    :param task_type: Task type forwarded to the LoRA configuration
    :param per_device_train_batch_size: Forwarded to TrainingArguments
    :param gradient_accumulation_steps: Forwarded to TrainingArguments
    :param warmup_steps: Forwarded to TrainingArguments
    :param max_steps: Forwarded to TrainingArguments
    :param learning_rate: Forwarded to TrainingArguments
    :param fp16: Forwarded to TrainingArguments
    :param logging_steps: Forwarded to TrainingArguments
    :param output_dir: Directory for the training outputs and the final
        saved model
    :param optim: Forwarded to TrainingArguments
    :return: None; metrics are logged, saved and printed, and the trained
        model is written to ``output_dir``
    """
    # Gradient checkpointing reduces memory usage during fine-tuning
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # Attach LoRA adapters to the module names found in the prepared model
    lora_targets = find_all_linear_names(model)
    lora_settings = create_peft_config(
        r=lora_r,
        lora_alpha=lora_alpha,
        target_modules=lora_targets,
        lora_dropout=lora_dropout,
        bias=bias,
        task_type=task_type,
    )
    model = get_peft_model(model, lora_settings)

    # Report how many parameters will actually be updated
    print_trainable_parameters(model)

    # Assemble the trainer from its arguments and a causal-LM (mlm=False) collator
    training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=warmup_steps,
        max_steps=max_steps,
        learning_rate=learning_rate,
        fp16=fp16,
        logging_steps=logging_steps,
        output_dir=output_dir,
        optim=optim,
    )
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=collator,
    )

    # NOTE(review): presumably disabled because the generation cache is not
    # wanted while training - confirm before relying on it for inference
    model.config.use_cache = False

    # Train, then log, persist and print the training metrics
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Write the trained model to the output directory
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Drop this function's references and release cached GPU memory before
    # the weights are merged (objects still referenced by the caller stay alive)
    del model
    del trainer
    torch.cuda.empty_cache()
初始化下面的 QLoRA 参数和训练参数以进行训练。
################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension (passed to LoraConfig as "r")
lora_r = 16
# Alpha parameter for LoRA scaling
lora_alpha = 64
# Dropout probability for LoRA layers
lora_dropout = 0.1
# Bias handling for LoRA: "none" means no bias parameters are selected for training
bias = "none"
# Task type passed to LoraConfig (causal language modeling)
task_type = "CAUSAL_LM"
################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored;
# the merge step later reloads the fine-tuned weights from this directory
output_dir = "./results"
# Batch size per GPU for training
per_device_train_batch_size = 1
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Optimizer to use
optim = "paged_adamw_32bit"
# Number of training steps (overrides num_train_epochs)
max_steps = 20
# Linear warmup steps from 0 to learning_rate
warmup_steps = 2
# Enable fp16/bf16 training (set bf16 to True with an A100)
# NOTE(review): bnb_4bit_compute_dtype is torch.bfloat16 while fp16 is enabled
# here - confirm that this combination is intended
fp16 = True
# Log every X updates steps
logging_steps = 1
调用下面的 fine_tune 函数,在我们预处理好的新闻分类指令数据集上对预训练模型进行微调(指令调优)。
# Run the fine-tuning with the parameters defined above; keyword arguments keep
# the long parameter list from being mismatched by position
fine_tune(
    model=model,
    tokenizer=tokenizer,
    dataset=preprocessed_dataset,
    lora_r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    task_type=task_type,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=warmup_steps,
    max_steps=max_steps,
    learning_rate=learning_rate,
    fp16=fp16,
    logging_steps=logging_steps,
    output_dir=output_dir,
    optim=optim,
)
我们可以为 trainer 添加 EarlyStoppingCallback,定期在验证数据集上评估模型并仅保留最佳权重。
合并权重
# Local directory for the merged checkpoint and Hub repository name for the upload
output_merged_dir = "results/news_classification_llama2_7b/final_merged_checkpoint"
new_model = "用户名/模型名llama-2-7b"

# Reload the fine-tuned weights from the training output directory
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Merge the LoRA layers with the base model
model = model.merge_and_unload()

# Save the merged model at the new location
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# Save the tokenizer next to the merged model for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)

# Push the model first, then the tokenizer, to the Hugging Face Hub
for artifact in (model, tokenizer):
    artifact.push_to_hub(new_model, use_auth_token=True)