👁关注👉🏻 程旭源,我们一起向上生长
前言简介
Pointer Generator Network结构[2]
数据集
模型选型
数据预处理 模型训练和评测 因为Transformers包已经帮我们封装好了模型、损失函数等内容,我们只需调用并定义好训练循环即可: 模型保存 优化器我们选择使用AdamW,并且通过 get_scheduler() 函数定义学习率调度器。在每一个epoch中,调用上面定义的train_loop和test_loop,模型在验证集上的rouge分数用来调整超参数和选出最好的模型,最后使用最好的模型跑测一下测试集来评估最终的性能。 总结 2022-12-31
2022-12-07
2022-12-05
[1] Text Summarization with Pretrained Encoders:https://arxiv.org/abs/1908.08345 [2] Get To The Point: Summarization with Pointer-Generator Networks:http://arxiv.org/abs/1704.04368 [3] BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension:https://aclanthology.org/2020.acl-main.703.pdf [4] BRIO: Bringing Order to Abstractive Summarization:https://arxiv.org/abs/2203.16804v1 [5] GSum: A General Framework for Guided Neural Abstractive Summarization:https://arxiv.org/abs/2010.08014 [6] SimCLS: A Simple Framework for Contrastive Learning of Abstractive Summarization:https://arxiv.org/abs/2106.01890v1 [7] Abstractive Summarization with Combination of Pre-trained Sequence-to-Sequence and Saliency Models:https://arxiv.org/abs/2003.13028 [8] XL-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages:https://github.com/csebuetnlp/xl-sum [9] mT5: A massively multilingual pre-trained text-to-text transformer:https://github.com/google-research/multilingual-t5 [10] Lcsts: A large scale chinese short text summarization dataset:https://arxiv.org/pdf/1506.05865.pdf [11] rouge:https://github.com/pltrdy/rouge [12] summarization:https://xiaosheng.run/ [13] Deep reinforcement and transfer learning for abstractive text summarization:https://www.sciencedirect.com/science/article/abs/pii/S0885230821000796 [14] Summarization Papers:https://github.com/xcfcode/Summarization-Papers
“点赞”是喜欢,“在看、分享”是真爱
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Checkpoint identifier shared by the tokenizer and model from_pretrained() calls.
model_checkpoint = "csebuetnlp/mT5_multilingual_XLSum"
# Load the tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
    """Run one training pass over ``dataloader`` and return the updated loss sum.

    Args:
        dataloader: Sized iterable of batches. Each batch must support
            ``.to(device)`` and ``**`` unpacking into the model call.
        model: Model whose forward output exposes a ``.loss`` attribute.
        optimizer: Optimizer; stepped once per batch.
        lr_scheduler: Learning-rate scheduler; stepped once per batch.
        epoch: 1-based epoch number. It is used to count the batches completed
            in earlier epochs, so that the progress bar shows the mean loss
            over every batch seen so far. With a 0-based value the divisor
            ``(epoch - 1) * len(dataloader) + batch`` is non-positive during
            the first epoch and hits zero on its last batch.
        total_loss: Sum of the batch losses accumulated in earlier epochs.

    Returns:
        ``total_loss`` plus the sum of this epoch's batch losses.
    """
    num_batches = len(dataloader)
    progress_bar = tqdm(range(num_batches))
    progress_bar.set_description(f'loss: {0:>7f}')
    # Number of batches already processed in previous epochs.
    finish_batch_num = (epoch - 1) * num_batches

    model.train()
    for batch, batch_data in enumerate(dataloader, start=1):
        batch_data = batch_data.to(device)
        outputs = model(**batch_data)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        # Display the running mean loss across all epochs so far.
        progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
        progress_bar.update(1)
    # Release the bar so later console output is not interleaved with it.
    progress_bar.close()
    return total_loss


# The evaluation code needs the `rouge` package; install it with:
#   pip install rouge
def test_loop(dataloader, model, tokenizer):
    """Generate a summary for every batch and score the results with ROUGE.

    Args:
        dataloader: Iterable of batches. Each batch must support
            ``.to(device)`` and provide ``"input_ids"``, ``"attention_mask"``
            and ``"labels"`` entries.
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to decode generated and reference token ids;
            its ``pad_token_id`` replaces the ``-100`` entries in the labels.

    Returns:
        dict: One entry per metric returned by ``Rouge.get_scores`` holding
        the F-score multiplied by 100, plus ``'avg'``, the mean of those
        values.
    """
    preds, labels = [], []
    rouge = Rouge()

    model.eval()
    with torch.no_grad():
        for batch_data in tqdm(dataloader):
            batch_data = batch_data.to(device)
            # Generate the predictions.
            generated_tokens = model.generate(
                batch_data["input_ids"],
                attention_mask=batch_data["attention_mask"],
                max_length=max_target_length,
                num_beams=beam_search_size,
                no_repeat_ngram_size=no_repeat_ngram_size,
            )
            # Unwrap a tuple result *before* converting to numpy; after the
            # conversion the value is an array and this check can never fire.
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            generated_tokens = generated_tokens.cpu().numpy()
            decoded_preds = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

            label_tokens = batch_data["labels"].cpu().numpy()
            # Replace -100 in the label sequences with the pad token id so
            # that the tokenizer can decode them.
            label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
            decoded_labels = tokenizer.batch_decode(
                label_tokens,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )

            # Convert to the list-of-strings format the rouge library accepts:
            # every character becomes its own space-separated token.
            preds += [' '.join(pred.strip()) for pred in decoded_preds]
            labels += [' '.join(label.strip()) for label in decoded_labels]

    # Let the rouge library compute the averaged ROUGE values.
    scores = rouge.get_scores(hyps=preds, refs=labels, avg=True)
    result = {key: value['f'] * 100 for key, value in scores.items()}
    result['avg'] = np.mean(list(result.values()))
    return result


# (article text displaced by extraction) get_scheduler()
# (article text displaced by extraction) "... function defines the
# learning-rate scheduler."

# Train the model.
total_steps = len(train_dataloader) * num_train_epochs

# Prepare optimizer and schedule (linear warmup and decay).
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
warmup_steps = int(total_steps * warmup_proportion)
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=learning_rate,
    betas=(adam_beta1, adam_beta2),
    eps=adam_epsilon,
)
lr_scheduler = get_scheduler(
    'linear',
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Train!
logger.info("***** Running training *****")
logger.info(f"Num examples - {len(train_data)}")
logger.info(f"Num Epochs - {num_train_epochs}")
logger.info(f"Total optimization steps - {total_steps}")

total_loss = 0.
best_avg_rouge = 0.
for epoch in range(num_train_epochs):
    # "\n" restored: the backslash was lost, leaving a literal trailing "n".
    print(f"Epoch {epoch+1}/{num_train_epochs}\n" + 30 * "-")
    # train_loop computes (epoch - 1) * len(dataloader), i.e. it expects a
    # 1-based epoch number. Passing the 0-based loop index drives its
    # running-loss divisor to zero on the last batch of the first epoch.
    total_loss = train_loop(train_dataloader, model, optimizer, lr_scheduler, epoch + 1, total_loss)
    dev_rouges = test_loop(dev_dataloader, model, tokenizer)
    logger.info(f"Dev Rouge1: {dev_rouges['rouge-1']:>0.2f} Rouge2: {dev_rouges['rouge-2']:>0.2f} RougeL: {dev_rouges['rouge-l']:>0.2f}")
    rouge_avg = dev_rouges['avg']
    # Keep a checkpoint only when the averaged dev ROUGE improves.
    if rouge_avg > best_avg_rouge:
        best_avg_rouge = rouge_avg
        logger.info(f'saving new weights to {output_dir}...\n')
        save_weight = f'epoch_{epoch+1}_rouge_{rouge_avg:0.4f}_weights.bin'
        torch.save(model.state_dict(), os.path.join(output_dir, save_weight))
logger.info("Done!")

# Reference
<
Comments(0)
大佬!求源码