[Paper Reproduction] InstructRAG

Model architecture: (figure omitted)

Reference project: weizhepei/InstructRAG

Models and datasets: meng-lab/InstructRAG

1 Installation

1.1 Virtual environment

```bash
conda create -n instrag python=3.10 -y
conda activate instrag

pip install numpy==1.26.4 vllm==0.4.1 accelerate pyserini ipykernel
python -m ipykernel install --user --name instrag
jupyter kernelspec list

conda install -c conda-forge faiss-cpu
pip install flash-attn==2.5.6 --no-build-isolation
pip install peft # required for LoRA fine-tuning
```

> Note that `pyserini` requires JDK 21: `conda install -c conda-forge openjdk=21 maven -y`, or configure it yourself:
> ```bash
> sudo apt install openjdk-21-jdk
> export JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64
> export PATH=$JAVA_HOME/bin:$PATH
> java -version
> javac -version
> ```

Alternatively, you can point to the Java environment inside the script itself:

```python
import os
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-21-openjdk-amd64'
os.environ['PATH'] = os.path.join(os.environ['JAVA_HOME'], 'bin') + ':' + os.environ['PATH']
```
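To confirm that this is actually the JDK `pyserini` will pick up, a quick check (illustrative, not part of the project):

```python
import shutil

# Should resolve to a java binary under the JDK 21 installation configured above
print(shutil.which("java"))
```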

1.2 Project structure

```text
dataset/            # demos, training set, and test set for each QA dataset
eval_results/       # test-set answers and evaluation results
records/            # logs
saved_checkpoints/  # LoRA fine-tuned model files
saved_models/       # fully fine-tuned (merged) models

src/                # project scripts
    common_utils.py
    data_utils.py
    finetune.py
    inference.py
    log_utils.py
    merge.py
    metrics.py
    rag.json

finetune.ipynb      # walkthrough of finetune.py
inference.ipynb     # walkthrough of inference.py
```

1.3 LLM

```bash
huggingface-cli download --token Your_token meta-llama/Meta-Llama-3-8B-Instruct --local-dir model/Llama-3-8B-Instruct
```

1.4 Datasets

1.4.1 Original datasets

Download the original datasets: `huggingface-cli download --token Your_token --repo-type dataset meng-lab/InstructRAG --local-dir dataset/origin`

1. ASQA:

```json
{
    "question": str,
    "answers": str list, # exactly 1 answer
    "qa_pairs": [
        {
            "question": str,
            "answers": str list, # multiple answers
        },
        ... # multiple QA pairs
    ]
}
```
2. 2WikiMultiHopQA & NaturalQuestions:

```json
{
    "question": str,
    "answers": str list, # exactly 1 answer
    "qa_pairs": null
}
```
3. PopQA & TriviaQA:

```json
{
    "question": str,
    "answers": str list, # multiple answers
    "qa_pairs": null
}
```

1.4.2 Rationale-augmented datasets

Download the rationale-augmented datasets: Google Drive

1. ASQA:

```json
{
    "question": str,
    "answers": str list, # exactly 1 answer
    "qa_pairs": [
        {
            "question": str,
            "answers": str list, # multiple answers
        },
        ... # multiple QA pairs
    ],
    "rationale": str, # absent in test.json
    "ctxs": [
        {
            "id": str,
            "title": str,
            "text": str,
            "score": str,
            "has_answer": str # only present in test.json
        },
        ... # multiple contexts
    ]
}
```
2. 2WikiMultiHopQA & NaturalQuestions:

```json
{
    "question": str,
    "answers": str list, # exactly 1 answer
    "qa_pairs": null,
    "rationale": str, # absent in test.json
    "ctxs": [
        {
            "id": str,
            "title": str,
            "text": str,
            "score": str,
            "has_answer": bool
        },
        ... # multiple contexts
    ]
}
```
3. PopQA & TriviaQA:

```json
{
    "question": str,
    "answers": str list, # multiple answers
    "qa_pairs": null,
    "rationale": str, # absent in test.json
    "ctxs": [
        {
            "id": str,
            "title": str,
            "text": str,
            "score": str
        },
        ... # multiple contexts
    ]
}
```
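A quick way to sanity-check these schemas is to load one split and inspect the first record. A minimal sketch; the path assumes the rationale-augmented ASQA training split sits at `dataset/ASQA/train.json`, so adjust it to your layout:

```python
import json

# Load the rationale-augmented ASQA training split (path is an assumption)
with open("dataset/ASQA/train.json", "r", encoding="utf-8") as f:
    data = json.load(f)

sample = data[0]
print(sorted(sample.keys()))     # expect: answers, ctxs, qa_pairs, question, rationale
print(len(sample["ctxs"]))       # number of retrieved contexts for this example
print(sample["ctxs"][0].keys())  # expect: id, title, text, score (plus has_answer in test.json)
```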

1.4.3 Answer recall

Run the following commands to compute the answer recall of each dataset:

```bash
DATASETS=("PopQA" "TriviaQA" "NaturalQuestions" "2WikiMultiHopQA" "ASQA")

for DATASET in "${DATASETS[@]}"; do
    python recall.py dataset/preprocess/$DATASET/train.json >> output.txt
    python recall.py dataset/preprocess/$DATASET/test.json >> output.txt
done
```

recall.py

```python
import json
import re
import string
import argparse
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from functools import partial


# Normalize answer text
def normalize_answer(s):
    def remove_articles(text):  # remove articles (a, an, the)
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):  # collapse extra whitespace so words are separated by single spaces
        return " ".join(text.split())

    def remove_punc(text):  # remove all punctuation
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):  # lowercase the text
        return text.lower()

    # Apply in order: lowercase -> strip punctuation -> strip articles -> fix whitespace
    return white_space_fix(remove_articles(remove_punc(lower(s))))


# Read a JSON/JSONL file
def read_json_data(file_path):
    if file_path.endswith('.json'):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    elif file_path.endswith('.jsonl'):
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(json.loads(line))
        return data
    else:
        raise ValueError("Unsupported file format")


# Build the merged list of candidate answers
def merge_answers(item):
    merged_answers = []

    # Add the top-level answers (if present and non-empty)
    if "answers" in item and item["answers"]:
        merged_answers.extend(item["answers"])

    # Add the answers from qa_pairs (if present and non-empty)
    if "qa_pairs" in item and item["qa_pairs"]:
        for qa_pair in item["qa_pairs"]:
            if "answers" in qa_pair and qa_pair["answers"]:
                merged_answers.extend(qa_pair["answers"])

    return merged_answers


# Check whether any answer appears in any retrieved passage
def check_answer_in_contexts(item):
    # Gather all candidate answers
    all_answers = merge_answers(item)
    all_answers = [normalize_answer(ans) for ans in all_answers if ans.strip()]

    # No answers at all: return False directly
    if not all_answers:
        return False

    # Concatenate the text of all passages (title + text)
    full_text = " ".join([
        f"{p['title']} {p['text']}"
        for p in item["ctxs"]
    ])
    full_text = normalize_answer(full_text)

    # Check whether any answer occurs in the text
    return any(ans in full_text for ans in all_answers if ans)


if __name__ == "__main__":
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description="Compute how often answers appear in retrieved passages")
    parser.add_argument("input_file", type=str, help="Path to the input JSON/JSONL file")
    args = parser.parse_args()

    # Read the data
    data = read_json_data(args.input_file)

    # Process in parallel with multiprocessing
    with Pool(processes=cpu_count()) as pool:
        # partial fixes check_answer_in_contexts as the worker function
        processor = partial(check_answer_in_contexts)
        # tqdm displays a progress bar
        results = list(tqdm(pool.imap(processor, data), total=len(data), desc="Progress"))

    # Aggregate the results
    total = len(results)
    found = sum(results)
    print(f"Analysis complete: checked {total} answerable questions")
    print(f"Questions whose answer was found: {found}")
    print(f"Answer recall: {found / total:.2%}\n")
```

The results are as follows:

| Dataset | Subset | Size | Top-K | Recall@K |
| --- | --- | --- | --- | --- |
| PopQA | Train | 12868 | 5 | 64.38% |
| PopQA | Test | 1399 | 5 | 69.69% |
| TriviaQA | Train | 78785 | 5 | 75.13% |
| TriviaQA | Test | 11313 | 5 | 75.07% |
| NaturalQuestions | Train | 79168 | 5 | 78.05% |
| NaturalQuestions | Test | 3610 | 5 | 68.95% |
| 2WikiMultiHopQA | Train | 167454 | 10 | 46.14% |
| 2WikiMultiHopQA | Test | 12576 | 10 | 40.94% |
| ASQA | Train | 4353 | 5 | 81.14% |
| ASQA | Test | 948 | 5 | 83.23% |

2 Overall pipeline

2.1 Retrieval with custom queries

Use castorini/pyserini with a prebuilt index of a retrieval corpus (e.g., a Wikipedia dump).

1. Sparse retrieval (BM25):

```python
from pyserini.search.lucene import LuceneSearcher

# Use a Wikipedia dump as the retrieval source
searcher = LuceneSearcher.from_prebuilt_index('wikipedia-dpr')
# Retrieve documents relevant to the given query
hits = searcher.search('who got the first nobel prize in physics')
# Show the top retrieved document and its relevance score
print(f'doc: {searcher.doc(hits[0].docid).raw()}\nscore: {hits[0].score}')
```

```text
doc: {
    "id" : "628725",
    "contents" : "\"Nobel Prize in Physics\"\nreceive a diploma, a medal and a document confirming the prize amount. Nobel Prize in Physics The Nobel Prize in Physics () is a yearly award given by the Royal Swedish Academy of Sciences for those who have made the most outstanding contributions for mankind in the field of physics. It is one of the five Nobel Prizes established by the will of Alfred Nobel in 1895 and awarded since 1901; the others being the Nobel Prize in Chemistry, Nobel Prize in Literature, Nobel Peace Prize, and Nobel Prize in Physiology or Medicine. The first Nobel Prize in Physics was"
}
score: 16.33108901977539
```
2. Dense retrieval (DPR):

```python
from pyserini.encode import DprQueryEncoder
from pyserini.search.faiss import FaissSearcher

# Load the query encoder
encoder = DprQueryEncoder("facebook/dpr-question_encoder-single-nq-base")
# Use a Wikipedia dump as the retrieval source
searcher = FaissSearcher.from_prebuilt_index('wikipedia-dpr-100w.dpr-single-nq', encoder)
# Retrieve documents relevant to the given query
hits = searcher.search('who got the first nobel prize in physics')
# Show the top retrieved document and its relevance score
print(f'doc: {searcher.doc(hits[0].docid).raw()}\nscore: {hits[0].score}')
```

```text
doc: {
    "id" : "284453",
    "contents" : "\"Nobel Prize\"\nA group including 42 Swedish writers, artists, and literary critics protested against this decision, having expected Leo Tolstoy to be awarded. Some, including Burton Feldman, have criticised this prize because they consider Prudhomme a mediocre poet. Feldman's explanation is that most of the Academy members preferred Victorian literature and thus selected a Victorian poet. The first Physiology or Medicine Prize went to the German physiologist and microbiologist Emil von Behring. During the 1890s, von Behring developed an antitoxin to treat diphtheria, which until then was causing thousands of deaths each year. The first Nobel Peace Prize went to the Swiss"
}
score: 82.00704956054688
```
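The `ctxs` field of the datasets in Section 1.4.2 can be rebuilt from such top-k results. A sketch, assuming `searcher` is the `FaissSearcher` created above and that each raw document is JSON whose `contents` begins with a quoted title line (as in the sample output); `retrieve_ctxs` is a hypothetical helper, not project code:

```python
import json

def retrieve_ctxs(question, k=5):  # k=5 matches the n_docs=5 used later
    hits = searcher.search(question, k=k)
    ctxs = []
    for hit in hits:
        doc = json.loads(searcher.doc(hit.docid).raw())
        title, _, text = doc["contents"].partition("\n")  # first line carries the quoted title
        ctxs.append({
            "id": str(hit.docid),
            "title": title.strip('"'),
            "text": text,
            "score": str(hit.score),
        })
    return ctxs

print(retrieve_ctxs("who got the first nobel prize in physics")[0]["title"])
```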

2.2 Rationale generation

```bash
DATASET=PopQA  # [PopQA, TriviaQA, NaturalQuestions, 2WikiMultiHopQA, ASQA]

CUDA_VISIBLE_DEVICES=0 python src/inference.py \
    --dataset_name $DATASET \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --n_docs 5 \
    --output_dir dataset/${DATASET} \
    --do_rationale_generation
```

2.3 Supervised fine-tuning

```bash
DATASET=PopQA # [PopQA, TriviaQA, NaturalQuestions, 2WikiMultiHopQA, ASQA]
PER_DEVICE_BATCH_SIZE=1
NUM_DEVICE=4
TOTAL_BATCH_SIZE=128
GRADIENT_ACC_STEPS=$(($TOTAL_BATCH_SIZE/$NUM_DEVICE/$PER_DEVICE_BATCH_SIZE))

CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=$NUM_DEVICE src/finetune.py \
    --model_name_or_path ../model/Llama-3-8B-Instruct \
    --dataset_name $DATASET \
    --output_dir saved_checkpoints/InstructRAG-FT/${DATASET} \
    --per_device_train_batch_size $PER_DEVICE_BATCH_SIZE \
    --gradient_accumulation_steps $GRADIENT_ACC_STEPS \
    --num_train_epochs 2 \
    --n_docs 5 \
    --learning_rate 2.5e-5 \
    --lr_scheduler_type "cosine" \
    --bf16 False \
    --tf32 False \
    --logging_steps 1 \
    --weight_decay 0.0 \
    --warmup_ratio 0.03 \
    --seed 42 \
    --model_max_length 4096 \
    --ddp_timeout 1800 \
    --fsdp "full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap "LlamaDecoderLayer" \
    --lora_enable \
    --lora_r 16 \
    --lora_alpha 32 \
    --lora_dropout 0.05
```

2.4 Merging the LoRA adapter

```bash
DATASET=PopQA # [PopQA, TriviaQA, NaturalQuestions, 2WikiMultiHopQA, ASQA]
mkdir -p saved_models/InstructRAG-FT/$DATASET

python src/merge.py --base_model_path ../model/Llama-3-8B-Instruct --adapter_path saved_checkpoints/InstructRAG-FT/$DATASET --output_path saved_models/InstructRAG-FT/$DATASET
```

merge.py

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--base_model_path", type=str, required=True, help="path to the base model")
parser.add_argument("--adapter_path", type=str, required=True, help="path to the adapter folder")
parser.add_argument("--output_path", type=str, required=True, help="where to save the merged model")
args = parser.parse_args()

base_model = AutoModelForCausalLM.from_pretrained(args.base_model_path)  # load the base model
lora_model = PeftModel.from_pretrained(base_model, args.adapter_path)  # load the LoRA adapter
merged_model = lora_model.merge_and_unload()  # merge the adapter into the base model

# Load the fine-tuned tokenizer (including any new special tokens)
tokenizer = AutoTokenizer.from_pretrained(args.adapter_path)

# Save the merged model and tokenizer
merged_model.save_pretrained(args.output_path)
tokenizer.save_pretrained(args.output_path)

print(f"Merged model saved to: {args.output_path}")
```

2.5 Inference and evaluation

```bash
DATASET=PopQA # [PopQA, TriviaQA, NaturalQuestions, 2WikiMultiHopQA, ASQA]
MODEL=InstructRAG-FT # [InstructRAG-FT, InstructRAG-ICL]

CUDA_VISIBLE_DEVICES=0 python src/inference.py \
    --dataset_name $DATASET \
    --rag_model $MODEL \
    --n_docs 5 \
    --output_dir eval_results/${MODEL}/${DATASET} \
    --load_local_model
```
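Evaluation here essentially checks whether a gold answer appears in the generated response (the project's actual metrics live in `src/metrics.py`). A minimal sketch of that idea, reusing `normalize_answer` from `recall.py` above:

```python
def answer_in_prediction(prediction: str, answers: list[str]) -> bool:
    # True if any normalized gold answer is a substring of the normalized prediction
    pred = normalize_answer(prediction)
    return any(normalize_answer(ans) in pred for ans in answers)

print(answer_in_prediction("Sonya Thomas won the contest in 2011.", ["Sonya Thomas"]))  # True
```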

3 Script walkthrough

Below, using demo.json from the ASQA dataset as the running example, we dissect finetune.py and inference.py to trace the project's execution logic and data flow. In the walkthrough, log_utils.py is replaced with print so the current state is easier to inspect.

3.1 finetune.py: supervised fine-tuning

1. Import the required packages:

```python
import torch
import transformers
import types
import dataclasses
import io
import os
import json
import copy
import numpy as np
from typing import Literal
from functools import partial
from dataclasses import dataclass, field
from transformers import Trainer, AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model
from typing import Dict, Sequence, Union
```
2. Define the model arguments, with LoRA fine-tuning options added:

```python
@dataclass
class ModelArguments:
    model_name_or_path: str = field(  # model name or path
        default=None,
        metadata={"help": "Name to a huggingface native pretrained model or path to a model on disk."}
    )
    lora_enable: bool = field(  # whether to enable LoRA fine-tuning
        default=False,
        metadata={"help": "Whether to enable LoRA for fine-tuning."}
    )
    lora_r: int = field(  # LoRA attention dimension
        default=16,
        metadata={"help": "LoRA attention dimension."}
    )
    lora_alpha: int = field(  # LoRA alpha scaling factor
        default=32,
        metadata={"help": "LoRA alpha scaling."}
    )
    lora_dropout: float = field(  # LoRA dropout probability
        default=0.05,
        metadata={"help": "LoRA dropout probability."}
    )

model_args = ModelArguments(model_name_or_path="../model/Llama-3-8B-Instruct", lora_enable=True, lora_r=16, lora_alpha=32, lora_dropout=0.05)
```
3. Define the data arguments:

```python
@dataclass
class DataArguments:
    dataset_name: str = field(  # dataset name
        default=None,
        metadata={
            "help": "Name of the dataset to load."
        }
    )

    prompt_dict_path: str = field(  # path to the prompt dictionary
        default="src/rag.json",
        metadata={
            "help": "Path to the dictionary for the prompt to format examples"
        }
    )

    n_docs: int = field(  # number of documents retrieved per example
        default=5,
        metadata={
            "help": "Number of documents retrieved for each example."
        }
    )

data_args = DataArguments(dataset_name="ASQA")
```
4. Define the training arguments:

```python
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: str = field(default=None)  # cache directory
    optim: str = field(default="adamw_torch")  # optimizer
    model_max_length: int = field(  # maximum sequence length of the model
        default=4096,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded to this length (and possibly truncated)."
            "Enforcing a consistent max length ensures memory usage is constant and predictable."
        }
    )

    padding: Literal["max_length", "longest"] = field(  # padding strategy
        default="longest",
        metadata={
            "help": "Padding strategy. If 'max_length', pads to `model_max_length` always; this might lead to some "
            "redundant compute. If 'longest', pads to the longest sequence in the batch, capped by `model_max_length`."
        }
    )

    resume_from_checkpoint: bool = field(  # whether to resume from a checkpoint
        default=False,
        metadata={"help": "If True, loads from last check point."}
    )

    use_fast_tokenizer: bool = field(  # whether to use the fast tokenizer
        default=False,
        metadata={
            "help": "Use fast tokenizer if True. "
            "Fast LLaMA tokenizer forces protobuf downgrade to 3.20.3. "
            "Use fast tokenizer only if you can live with that."
        }
    )

PER_DEVICE_BATCH_SIZE = 1  # batch size per device
NUM_DEVICE = 4  # number of devices
TOTAL_BATCH_SIZE = 128  # total (effective) batch size
GRADIENT_ACC_STEPS = TOTAL_BATCH_SIZE // NUM_DEVICE // PER_DEVICE_BATCH_SIZE  # gradient accumulation steps: 128 // 4 // 1 = 32
training_args = TrainingArguments(
    output_dir="./",  # output directory
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACC_STEPS,
    num_train_epochs=2,  # total training epochs
    learning_rate=2.5e-5,  # learning rate
    lr_scheduler_type="cosine",  # learning-rate scheduler
    bf16=False,  # bfloat16 precision disabled
    tf32=False,  # TensorFloat-32 precision disabled
    logging_steps=1,  # log every step
    weight_decay=0.0,  # weight decay
    warmup_ratio=0.03,  # warm up for 3% of total training steps
    seed=42,  # random seed
    model_max_length=4096,  # maximum model input length
    ddp_timeout=1800,  # DDP (distributed data parallel) timeout, in seconds
    fsdp="full_shard auto_wrap",  # fully sharded data parallel (FSDP) with automatic wrapping of the specified layers
    fsdp_transformer_layer_cls_to_wrap="LlamaDecoderLayer"  # transformer layer class to wrap
)
```

```text
TrainingArguments(output_dir='/', overwrite_output_dir=False, do_train=False, do_eval=False, do_predict=False, eval_strategy=<IntervalStrategy.NO: 'no'>, prediction_loss_only=False, per_device_train_batch_size=1, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=32, eval_accumulation_steps=None, eval_delay=0, torch_empty_cache_steps=None, learning_rate=2.5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=2, max_steps=-1, lr_scheduler_type=<SchedulerType.COSINE: 'cosine'>, lr_scheduler_kwargs={}, warmup_ratio=0.03, warmup_steps=0, log_level='passive', log_level_replica='warning', log_on_each_node=True, logging_dir='/runs/Apr10_12-14-05_89a604750497', logging_strategy=<IntervalStrategy.STEPS: 'steps'>, logging_first_step=False, logging_steps=1, logging_nan_inf_filter=True, save_strategy=<SaveStrategy.STEPS: 'steps'>, save_steps=500, save_total_limit=None, save_safetensors=True, save_on_each_node=False, save_only_model=False, restore_callback_states_from_checkpoint=False, no_cuda=False, use_cpu=False, use_mps_device=False, seed=42, data_seed=None, jit_mode_eval=False, use_ipex=False, bf16=False, fp16=False, fp16_opt_level='O1', half_precision_backend='auto', bf16_full_eval=False, fp16_full_eval=False, tf32=False, local_rank=0, ddp_backend=None, tpu_num_cores=None, tpu_metrics_debug=False, debug=[], dataloader_drop_last=False, eval_steps=None, dataloader_num_workers=0, dataloader_prefetch_factor=None, past_index=-1, run_name='/', disable_tqdm=False, remove_unused_columns=True, label_names=None, load_best_model_at_end=False, metric_for_best_model=None, greater_is_better=None, ignore_data_skip=False, fsdp=[<FSDPOption.FULL_SHARD: 'full_shard'>, <FSDPOption.AUTO_WRAP: 'auto_wrap'>], fsdp_min_num_params=0, fsdp_config={'min_num_params': 0, 'transformer_layer_cls_to_wrap': ['LlamaDecoderLayer'], 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, fsdp_transformer_layer_cls_to_wrap='LlamaDecoderLayer', accelerator_config=AcceleratorConfig(split_batches=False, dispatch_batches=None, even_batches=True, use_seedable_sampler=True, non_blocking=False, gradient_accumulation_kwargs=None, use_configured_state=False), deepspeed=None, label_smoothing_factor=0.0, optim=<OptimizerNames.ADAMW_TORCH: 'adamw_torch'>, optim_args=None, adafactor=False, group_by_length=False, length_column_name='length', report_to=[], ddp_find_unused_parameters=None, ddp_bucket_cap_mb=None, ddp_broadcast_buffers=None, dataloader_pin_memory=True, dataloader_persistent_workers=False, skip_memory_metrics=True, use_legacy_prediction_loop=False, push_to_hub=False, resume_from_checkpoint=False, hub_model_id=None, hub_strategy=<HubStrategy.EVERY_SAVE: 'every_save'>, hub_token=None, hub_private_repo=None, hub_always_push=False, gradient_checkpointing=False, gradient_checkpointing_kwargs=None, include_inputs_for_metrics=False, include_for_metrics=[], eval_do_concat_batches=True, fp16_backend='auto', evaluation_strategy=None, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=None, mp_parameters='', auto_find_batch_size=False, full_determinism=False, torchdynamo=None, ray_scope='last', ddp_timeout=1800, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, dispatch_batches=None, split_batches=None, include_tokens_per_second=False, include_num_input_tokens_seen=False, neftune_noise_alpha=None, optim_target_modules=None, batch_eval_metrics=False, 
eval_on_start=False, use_liger_kernel=False, eval_use_gather_object=False, average_tokens_across_devices=False, cache_dir=None, model_max_length=4096, padding='longest', use_fast_tokenizer=False)
```
5. Load the functions from common_utils.py:

```python
# Object creation in a distributed setting can be very memory-hungry. This class staggers
# creation across odd and even ranks so that not all objects are created at once.
# local_rank == -1 is assumed to mean no distributed training.
class staggered_object_creation(object):
    def __init__(self, local_rank: int, world_size: int):
        super().__init__()
        self.local_rank = local_rank  # current rank
        self.world_size = world_size  # total number of ranks

    # On entering the context, even ranks wait
    def __enter__(self, *args, **kwargs):
        del args, kwargs
        if self.world_size > 1 and self.local_rank % 2 == 0:
            dist.barrier()  # distributed barrier
        return self

    # On exit, odd ranks finish first, then all ranks synchronize
    def __exit__(self, *args, **kwargs):
        del args, kwargs
        if self.world_size > 1:
            if self.local_rank % 2 == 1:
                dist.barrier()
            dist.barrier()  # final safety barrier

    # Decorator usage
    def __call__(self, func):
        def decorator(*args, **kwargs):
            with self:
                return func(*args, **kwargs)
        return decorator

# Patch the model's zero_grad method to save memory
def let_model_save_mem_when_zero_grad(model: torch.nn.Module):
    # Set all parameter gradients to zero or None
    def new_zero_grad(self, set_to_none: bool = True) -> None:
        if getattr(self, "_is_replica", False):
            warnings.warn(
                "Calling .zero_grad() from a module created with nn.DataParallel() has no effect. "
                "The parameters are copied (in a differentiable manner) from the original module. "
                "This means they are not leaf nodes in autograd and so don't accumulate gradients. "
                "If you need gradients in your forward method, consider using autograd.grad instead."
            )

        for p in self.parameters():
            if p.grad is not None:
                if set_to_none:  # setting to None saves more memory
                    p.grad = None
                else:
                    if p.grad.grad_fn is not None:
                        p.grad.detach_()
                    else:
                        p.grad.requires_grad_(False)
                    p.grad.zero_()

    # Dynamically replace the model's zero_grad method
    model.zero_grad = types.MethodType(new_zero_grad, model)
    return model

# Ensure the file object is a readable IO base
def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f

# Load a JSON file into a dict
def jload(f, mode="r"):
    f = _make_r_io_base(f, mode)  # ensure the file is readable
    jdict = json.load(f)  # load the JSON
    f.close()
    return jdict

# Collect the state dict and save it to disk
def safe_save_model_for_hf_trainer(
    trainer: transformers.Trainer, output_dir: str, give_rw_access=True, rank0_only=True
):
    now = time.perf_counter()  # record the start time

    # FSDP (fully sharded data parallel) case
    if trainer.is_fsdp_enabled:
        # Configure a full state dict, optionally offloaded to CPU and gathered on rank 0 only
        cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=rank0_only)
        with FSDP.state_dict_type(trainer.model, StateDictType.FULL_STATE_DICT, cfg):
            state_dict = trainer.model.state_dict()
            if trainer.args.should_save:  # only ranks that should save do so
                trainer._save(output_dir, state_dict=state_dict)  # save the model

    # DeepSpeed case
    elif trainer.is_deepspeed_enabled:
        if trainer.args.should_save:
            trainer._save(output_dir)  # DeepSpeed has its own saving logic

    else:  # neither FSDP nor DeepSpeed
        state_dict = trainer.model.state_dict()
        if trainer.args.should_save:
            cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}  # move to CPU
            del state_dict  # free the original state dict to save memory
            trainer._save(output_dir, state_dict=cpu_state_dict)  # save the CPU state dict

    if trainer.args.should_save:
        if give_rw_access:  # optionally grant read/write access to the directory
            try:
                os.system(f"chmod -R a+xwr {output_dir}")
            except Exception as e:
                print(f"Failed to give read-write access to {output_dir}: {e}")
        print(f"Saving model took {time.perf_counter() - now:.2f} seconds.")  # log the save duration
```
6. Create the context manager:

```python
ctx_mgr = staggered_object_creation(
    local_rank=training_args.local_rank, world_size=training_args.world_size
)
```
7. Use the context manager:

```python
with ctx_mgr:
    # Define the model's keyword arguments
    model_kwargs = dict(
        ### V100 GPUs cannot use flash attention
        # attn_implementation="flash_attention_2",
        ###
        config=transformers.AutoConfig.from_pretrained(model_args.model_name_or_path),  # load the config of the pretrained model
        cache_dir=training_args.cache_dir,  # cache directory
        low_cpu_mem_usage=True,  # low CPU memory usage
        device_map={"": training_args.device.index},  # device map
    )

    # Load the pretrained model
    model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)
    # Optimize the model's memory usage when gradients are zeroed
    let_model_save_mem_when_zero_grad(model)
```

```text
Loading checkpoint shards: 100%|██████████| 4/4 [00:36<00:00,  9.21s/it]
```
8. Load the tokenizer and set the padding strategy:

```python
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    truncation_side="left",
    use_fast=training_args.use_fast_tokenizer,
)

tokenizer.padding = training_args.padding
# If the tokenizer has no pad_token, fall back to eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
```
9. Apply LoRA:

```python
if model_args.lora_enable:
    # Create the LoRA config
    lora_config = LoraConfig(
        r=model_args.lora_r,
        lora_alpha=model_args.lora_alpha,
        lora_dropout=model_args.lora_dropout,
        target_modules=["q_proj", "v_proj"],  # target modules
        task_type=TaskType.CAUSAL_LM,  # causal language modeling task
    )
    # Wrap the model with LoRA
    model = get_peft_model(model, lora_config)
    # Print the trainable parameters
    model.print_trainable_parameters()
```

```text
trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848
```
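The printed percentage can be verified by hand from the two numbers in the log:

```python
# LoRA adapter parameters over total parameters, matching the log above
print(f"{6_815_744 / 8_037_076_992:.4%}")  # 0.0848%
```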
10. Load the functions from data_utils.py:

```python
IGNORE_INDEX = -100  # index ignored by the loss; marks tokens that should not contribute to the loss

# Supervised fine-tuning dataset, subclassing PyTorch's Dataset
class SFTDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data_list: list[dict],
        prompt_dict: dict,
        tokenizer: transformers.PreTrainedTokenizer,
        n_docs: int,
    ):
        super(SFTDataset, self).__init__()

        # Preprocess the data into RAG-formatted inputs
        sft_data = preprocess_for_rag(data_list=data_list, prompt_dict=prompt_dict, tokenizer=tokenizer, n_docs=n_docs)

        self.input_ids = sft_data["input_ids"]  # input IDs
        self.labels = sft_data["labels"]  # labels

        self.metadata = sft_data["metadata"]  # metadata
        self.tokenization_metadata = sft_data["tokenization_metadata"]  # tokenization metadata

    # Return the dataset size
    def __len__(self):
        return len(self.input_ids)

    # Fetch a single example
    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])

# Data collator for the SFT dataset
@dataclasses.dataclass
class DataCollatorForSFTDataset(object):
    tokenizer: transformers.PreTrainedTokenizer  # tokenizer

    # Pad input IDs and labels to a common length
    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:

        input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id).long()  # build the attention mask

        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )

# Build the supervised training data
def make_supervised_data(
    tokenizer: transformers.PreTrainedTokenizer,
    data_args,
):

    prompt_dict = jload(data_args.prompt_dict_path)  # load the prompt templates

    data_path = os.path.join('dataset', data_args.dataset_name, 'demo.json')  # data path changed from train.json to demo.json, which holds a single example
    print(f"Loading training set from: {data_path}")
    data_list = jload(data_path)  # load the training data

    # Build the SFT dataset
    train_dataset = SFTDataset(
        data_list=data_list,
        prompt_dict=prompt_dict,
        tokenizer=tokenizer,
        n_docs=data_args.n_docs,
    )

    data_collator = DataCollatorForSFTDataset(tokenizer=tokenizer)  # create the data collator
    return dict(train_dataset=train_dataset, data_collator=data_collator)  # return the dataset and collator

# Normalize a question: ensure it ends with a question mark and starts cleanly
def normalize_question(question):
    if not question.endswith("?"):
        question = question + "?"
    if question.startswith("."):  # check for a leading "."
        question = question.lstrip(". ")  # strip leading "." and spaces

    return question[0].lower() + question[1:]  # lowercase the first character

# Build the context documents, keeping them in ascending score order
def build_contexts(example, n_docs):
    if len(example["ctxs"]) > 0 and example["ctxs"][0]["score"] > example["ctxs"][1]["score"]:  # if the first document scores higher, reverse the list
        ctxs_list = example["ctxs"][:n_docs][::-1]
    else:  # otherwise take the first n_docs documents as-is
        ctxs_list = example["ctxs"][:n_docs]

    # Format the document text
    docs_text = "\n\n".join([f"Document {idx+1} (Title: {ctx['title']}): {ctx['text']}" for idx, ctx in enumerate(ctxs_list)])
    doc_prompt = f"{docs_text}\n\n"

    return doc_prompt

# Preprocess the data into RAG-formatted inputs
def preprocess_for_rag(
    data_list: list[dict],
    prompt_dict: dict,
    tokenizer: transformers.PreTrainedTokenizer,
    n_docs: int,
    verbose=True,
) -> dict[str, Union[torch.Tensor, Sequence[torch.Tensor]]]:

    sources = []  # source texts
    targets = []  # target texts

    assistant_prefix = prompt_dict['assistant_prefix']  # assistant prefix
    assist_prefix_len = len(tokenizer.encode(assistant_prefix, add_special_tokens=False, return_tensors="pt")[0])  # prefix length

    user_prefix = prompt_dict['user_prefix']  # user prefix
    user_prefix_id = tokenizer.encode(user_prefix, add_special_tokens=True, return_tensors="pt")[0]  # user prefix IDs
    user_prefix_len = len(user_prefix_id)  # user prefix length

    for sample in data_list:
        query_prompt = prompt_dict['query_prompt'] + normalize_question(sample['question'])  # query prompt
        doc_prompt = build_contexts(sample, n_docs=n_docs)  # document prompt
        sources.append(doc_prompt + query_prompt)  # assemble the source text

        target_prompt = assistant_prefix + sample['rationale'] + tokenizer.eos_token  # target text
        targets.append(target_prompt)

    examples = [s + t for s, t in zip(sources, targets)]  # concatenate sources and targets
    examples_tokenized = _tokenize_fn(examples, tokenizer, max_len_offset=[user_prefix_len] * len(examples), add_special_tokens=False)  # tokenize

    input_ids = [torch.cat([user_prefix_id, ctx]) for ctx in examples_tokenized["input_ids"]]  # prepend the user prefix IDs
    targets_tokenized = _tokenize_fn(targets, tokenizer, add_special_tokens=False)  # tokenize the targets

    labels = copy.deepcopy(input_ids)  # deep-copy the input IDs as labels

    for idx, label in enumerate(labels):
        target_len = len(targets_tokenized["input_ids"][idx])  # target length

        if idx == 0:
            logger.warning(f'\n===DEBUG Input:\n{json.dumps(tokenizer.decode(label))}===')  # debug: input
            logger.warning(f'\n===DEBUG Target:\n{label[-(target_len - assist_prefix_len):]} ==> {json.dumps(tokenizer.decode(label[-(target_len - assist_prefix_len):]))}===')  # debug: target

        assert torch.all(labels[idx][-(target_len-assist_prefix_len):].eq(targets_tokenized["input_ids"][idx][assist_prefix_len:]))  # verify the target matches

        label[:-(target_len - assist_prefix_len)] = IGNORE_INDEX  # mask out the part that should not contribute to the loss

    packaged_data = dict(
        input_ids=input_ids,
        labels=labels,
        metadata=dict(),
        tokenization_metadata=examples_tokenized["tokenization_metadata"],
    )

    if verbose:
        logger.warning(f"Tokenization metadata:\n{json.dumps(packaged_data['tokenization_metadata'])}")  # log the tokenization metadata

    return packaged_data

# Tokenize a text
def _tokenize_text(x, tokenizer, padding, add_special_tokens):
    tokenized = tokenizer(
        text=x,
        return_tensors="pt",
        padding=padding,
        max_length=tokenizer.model_max_length,
        truncation=True,
        add_special_tokens=add_special_tokens,
    )
    return tokenized

# Tokenize a text with a length offset
def _tokenize_text_with_offset(x, tokenizer, padding, add_special_tokens):
    tokenized = tokenizer(
        text=x[0],
        return_tensors="pt",
        padding=padding,
        max_length=tokenizer.model_max_length - x[1],  # account for the offset
        truncation=True,
        add_special_tokens=add_special_tokens,
    )
    return tokenized

# Tokenize a list of strings
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer, max_len_offset=None, add_special_tokens=True) -> dict:
    padding = getattr(tokenizer, "padding", "longest")  # get the padding strategy
    if max_len_offset is not None:
        tokenized_list = list(
            map(
                partial(_tokenize_text_with_offset, tokenizer=tokenizer, padding=padding, add_special_tokens=add_special_tokens),
                zip(strings, max_len_offset),  # pair each string with its offset
            )
        )
    else:
        tokenized_list = list(
            map(
                partial(_tokenize_text, tokenizer=tokenizer, padding=padding, add_special_tokens=add_special_tokens),
                strings,
            )
        )

    input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]  # input IDs and labels
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list  # count non-padding tokens
    ]

    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
        tokenization_metadata=dict(  # tokenization metadata
            num_examples=len(tokenized_list),
            input_ids_avg_len=np.mean(input_ids_lens),
            input_ids_max_len=max(input_ids_lens),
            input_ids_min_len=min(input_ids_lens),
            labels_avg_len=np.mean(labels_lens),
            labels_max_len=max(labels_lens),
            labels_min_len=min(labels_lens),
            model_max_length=tokenizer.model_max_length,
        ),
    )
```
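As a quick illustration of normalize_question (visible in the debug output below, where the question is lowercased and given a trailing question mark):

```python
print(normalize_question("Who won nathan's hot dog eating contest"))
# who won nathan's hot dog eating contest?
```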
11. Create the data module:

```python
data_module: dict = make_supervised_data(
    tokenizer=tokenizer,
    data_args=data_args,
)
```

```text
Loading training set from: dataset/ASQA/demos.json

===DEBUG Input:
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nDocument 1 (Title: Nathan's Hot Dog Eating Contest): competitive eating by downing 50 hot dogs\u2014smashing the previous record of 25.5. The Japanese eater introduced advanced eating and training techniques that shattered previous competitive eating world records. The rise in popularity of the event coincided with the surge in popularity of the worldwide competitive eating circuit. On July 4, 2011, Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women. Previously, women and men had competed against each other, except for one all-female Memorial Day competition held in 1975. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pepto-Bismol-sponsored pink belt and\n\nDocument 2 (Title: Nathan's Hot Dog Eating Contest): Island most years since about 1972, usually in conjunction with Independence Day. Nathan's promoter Mortimer Matz claimed that on July 4, 1916, four immigrants held a hot dog eating contest at Nathan's Famous stand on Coney Island to settle an argument about who was the most patriotic. He also made the spurious claim that the contest has been held each year since then except 1941 (\"as a protest to the war in Europe\") and 1971 (as a protest to political unrest in the U.S.). A man by the name of Jim Mullen is said to have won the first contest,\n\nDocument 3 (Title: Nathan's Hot Dog Eating Contest): called to the stage individually during introductions. In 2013, six-time defending champion Joey Chestnut was escorted to the stage in a sedan chair. The competition draws many spectators and worldwide press coverage. In 2007, an estimated 50,000 came out to witness the event. In 2004 a three-story-high \"Hot Dog Eating Wall of Fame\" was erected at the site of the annual contest. The wall lists past winners, and has a digital clock which counts down the minutes until the next contest. Despite substantial damage suffered at Nathan's due to Hurricane Sandy in October 2012, the location was repaired, reopened, and\n\nDocument 4 (Title: Nathan's Hot Dog Eating Contest): Nathan's Hot Dog Eating Contest The Nathan's Hot Dog Eating Contest is an annual American hot dog competitive eating competition. It is held each year on Independence Day at Nathan's Famous Corporation's original, and best-known restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The contest has gained public attention in recent years due to the stardom of Takeru Kobayashi and Joey Chestnut. The defending men's champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest. The defending women's champion is Miki Sudo, who ate 37 hot\n\nDocument 5 (Title: Sonya Thomas): at the time. The only eaters besides Kobayashi to defeat Thomas between the 2004 and 2005 Nathan's contests were Bill \"El Wingador\" Simmons in the controversial 2005 Wing Bowl, and Dale Boone, who won a contest eating overheated baked beans after he doused the beans in water to cool them down. On July 4, 2011, Thomas became the first champion of Nathan's Hot Dog Eating Contest for Women. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pink Belt and won $10,000. 
Thomas had won 1st place in the National Buffalo Wing Festival US chicken wing eating championship\n\nBased on your knowledge and the provided information, answer the question:\n{question}who won nathan's hot dog eating contest?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe documents that are useful to answer the question \"Who won Nathan's hot dog eating contest?\" are:\n\n* Document 1: Nathan's Hot Dog Eating Contest\n* Document 5: Sonya Thomas\n\nDocument 1 provides information about the history of the contest, including the fact that Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women in 2011.\n\nDocument 5 provides more information about Sonya Thomas, including her achievements in competitive eating, including winning the Nathan's Hot Dog Eating Contest for Women in 2011.\n\nBased on these documents, we can conclude that Sonya Thomas won the Nathan's Hot Dog Eating Contest for Women in 2011.\n\nHowever, the provided answer mentions the champions of the 2021 edition, which is not mentioned in the provided documents. Therefore, we cannot determine the winners of the contest for years after 2011 based on the provided information.\n\nIn conclusion, the answer to the question \"Who won Nathan's hot dog eating contest?\" is Sonya Thomas, who won the Nathan's Hot Dog Eating Contest for Women in 2011.<|eot_id|>"===

===DEBUG Target:
tensor([   791,   9477,    430,    527,   5505,    311,   4320,    279,   3488,
       330,  15546,   2834,  37837,    596,   4106,   5679,  12459,  14130,
      7673,    527,   1473,      9,  12051,    220,     16,     25,  37837,
       596,   8166,  14588,  60638,  47633,    198,      9,  12051,    220,
        20,     25,  21533,     64,  11355,    271,   7676,    220,     16,
      5825,   2038,    922,    279,   3925,    315,    279,  14130,     11,
      2737,    279,   2144,    430,  21533,     64,  11355,   6244,    279,
     18824,    315,    279,   1176,  37837,    596,   8166,  14588,  60638,
     47633,    369,  11215,    304,    220,    679,     16,    382,   7676,
       220,     20,   5825,    810,   2038,    922,  21533,     64,  11355,
        11,   2737,   1077,  33997,    304,  15022,  12459,     11,   2737,
     11230,    279,  37837,    596,   8166,  14588,  60638,  47633,    369,
     11215,    304,    220,    679,     16,    382,  29815,    389,   1521,
      9477,     11,    584,    649,  32194,    430,  21533,     64,  11355,
      2834,    279,  37837,    596,   8166,  14588,  60638,  47633,    369,
     11215,    304,    220,    679,     16,    382,  11458,     11,    279,
      3984,   4320,  34945,    279,  34838,    315,    279,    220,   2366,
        16,  14002,     11,    902,    374,    539,   9932,    304,    279,
      3984,   9477,     13,  15636,     11,    584,   4250,   8417,    279,
     26526,    315,    279,  14130,    369,   1667,   1306,    220,    679,
        16,   3196,    389,    279,   3984,   2038,    382,    644,  17102,
        11,    279,   4320,    311,    279,   3488,    330,  15546,   2834,
     37837,    596,   4106,   5679,  12459,  14130,   7673,    374,  21533,
        64,  11355,     11,    889,   2834,    279,  37837,    596,   8166,
     14588,  60638,  47633,    369,  11215,    304,    220,    679,     16,
        13, 128009]) ==> "The documents that are useful to answer the question \"Who won Nathan's hot dog eating contest?\" are:\n\n* Document 1: Nathan's Hot Dog Eating Contest\n* Document 5: Sonya Thomas\n\nDocument 1 provides information about the history of the contest, including the fact that Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women in 2011.\n\nDocument 5 provides more information about Sonya Thomas, including her achievements in competitive eating, including winning the Nathan's Hot Dog Eating Contest for Women in 2011.\n\nBased on these documents, we can conclude that Sonya Thomas won the Nathan's Hot Dog Eating Contest for Women in 2011.\n\nHowever, the provided answer mentions the champions of the 2021 edition, which is not mentioned in the provided documents. Therefore, we cannot determine the winners of the contest for years after 2011 based on the provided information.\n\nIn conclusion, the answer to the question \"Who won Nathan's hot dog eating contest?\" is Sonya Thomas, who won the Nathan's Hot Dog Eating Contest for Women in 2011.<|eot_id|>"===
Tokenization metadata:
{"num_examples": 2, "input_ids_avg_len": 1099.5, "input_ids_max_len": 1216, "input_ids_min_len": 983, "labels_avg_len": 1099.5, "labels_max_len": 1216, "labels_min_len": 983, "model_max_length": 4096}
```
12. Create the trainer:

```python
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    **data_module,
)
```
13. Start training:

```python
trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
# Log after training finishes
print("hooray! training finished successfully!\nNow on to model saving -- With mixed precision, FSDP will upcast in the model preparation step, and FSDP will then save checkpoints in the upcasted precision. See: https://huggingface.co/docs/accelerate/en/concept_guides/fsdp_and_deepspeed")
trainer.save_state()  # save the trainer state
safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)  # safely save the model
print("hooray again! model saving worked.")
```

3.2 inference.py: inference and evaluation

1. Import the required packages:

```python
import sys
import io
import os
import json
import transformers
import copy
import torch
import functools
import string
import re
import numpy as np
from vllm import LLM, SamplingParams
from tqdm import tqdm
from typing import Union
```
2. Load the functions from common_utils.py:

```python
makedirs = functools.partial(os.makedirs, exist_ok=True)

# Ensure the file object is a readable IO base
def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f

# Ensure the file object is a writable IO base, creating directories if needed
def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            makedirs(f_dirname)  # create parent directories
        f = open(f, mode=mode)
    return f

# Load a JSON file into a dict
def jload(f, mode="r"):
    f = _make_r_io_base(f, mode)  # ensure the file is readable
    jdict = json.load(f)  # load the JSON
    f.close()
    return jdict

# Dump an object to a file as JSON
def jdump(obj: Union[str, dict, list], f, mode="w", indent=4, default=str):
    f = _make_w_io_base(f, mode)  # ensure the file is writable
    if isinstance(obj, (dict, list)):
        json.dump(obj, f, indent=indent, default=default)
    elif isinstance(obj, str):
        f.write(obj)
    else:
        raise ValueError(f"Unexpected type: {type(obj)}")
    f.close()
```
3. Emulate the command-line parser; only the FT model is walked through, the ICL logic is similar:

```python
class Args:
    pass

args = Args()

args.dataset_name = "ASQA"  # dataset name
args.rag_model = "InstructRAG-FT"  # RAG model type
args.model_name_or_path = "meta-llama/Meta-Llama-3-8B-Instruct"  # model name or path
args.load_local_model = True  # whether to load a local model
args.do_rationale_generation = False  # whether to generate rationales
args.n_docs = 5  # number of retrieved documents
args.output_dir = "./"  # output directory
args.cache_dir = None  # model cache directory
args.prompt_dict_path = "src/rag.json"  # path to the prompt templates
args.temperature = 0  # sampling temperature
args.max_tokens = 4096  # maximum number of tokens
args.seed = 42  # random seed
args.max_instances = sys.maxsize  # maximum number of instances
```
4. do_rationale_generation and eval_model share the same logic; the former generates rationales for the training data while the latter generates answers for the test data, so only eval_model is dissected:

```python
# The data path is changed from test.json to demo.json, which holds a single example
data_path = f'dataset/{args.dataset_name}/demo.json'

test_data = jload(data_path)[:args.max_instances]
print(json.dumps(test_data, ensure_ascii=False, indent=4))
```

```json
[
    {
        "question": "Who won nathan's hot dog eating contest?",
        "answers": [
            "The Nathan's Hot Dog Eating Contest is an annual American hot dog eating competition held on Independence Day at Nathan's Famous Corporation's original restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The current men's and women's competitions champions are Joey Chestnut and Michelle Lesco, who crowned themselves in the 2021 edition. Previously, Miki Sudo had won the women's competition every year from 2014-2020, with Chestnut doing so in the men's variant in 2017 and 2016 and Matt Stonie in 2015."
        ],
        "qa_pairs": [
            {
                "question": "Who won the men's competition of Nathan's Hot Dog Eating Contest in 2017?",
                "answers": [
                    "Joey Chestnut"
                ]
            },
            {
                "question": "Who won the women's competition of Nathan's Hot Dog Eating Contest in 2017?",
                "answers": [
                    "Miki Sudo"
                ]
            },
            {
                "question": "Who won the men's competition of Nathan's Hot Dog Eating Contest in 2016?",
                "answers": [
                    "Joey Chestnut"
                ]
            },
            {
                "question": "Who won the women's competition of Nathan's Hot Dog Eating Contest in 2016?",
                "answers": [
                    "Miki Sudo"
                ]
            },
            {
                "question": "Who won the men's competition of Nathan's Hot Dog Eating Contest in 2015?",
                "answers": [
                    "Matt Stonie"
                ]
            },
            {
                "question": "Who won the women's competition of Nathan's Hot Dog Eating Contest in 2015?",
                "answers": [
                    "Miki Sudo"
                ]
            }
        ],
        "rationale": "The documents that are useful to answer the question \"Who won Nathan's hot dog eating contest?\" are:\n\n* Document 1: Nathan's Hot Dog Eating Contest\n* Document 5: Sonya Thomas\n\nDocument 1 provides information about the history of the contest, including the fact that Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women in 2011.\n\nDocument 5 provides more information about Sonya Thomas, including her achievements in competitive eating, including winning the Nathan's Hot Dog Eating Contest for Women in 2011.\n\nBased on these documents, we can conclude that Sonya Thomas won the Nathan's Hot Dog Eating Contest for Women in 2011.\n\nHowever, the provided answer mentions the champions of the 2021 edition, which is not mentioned in the provided documents. Therefore, we cannot determine the winners of the contest for years after 2011 based on the provided information.\n\nIn conclusion, the answer to the question \"Who won Nathan's hot dog eating contest?\" is Sonya Thomas, who won the Nathan's Hot Dog Eating Contest for Women in 2011.",
        "ctxs": [
            {
                "id": "3360010",
                "title": "Nathan's Hot Dog Eating Contest",
                "text": "competitive eating by downing 50 hot dogs—smashing the previous record of 25.5. The Japanese eater introduced advanced eating and training techniques that shattered previous competitive eating world records. The rise in popularity of the event coincided with the surge in popularity of the worldwide competitive eating circuit. On July 4, 2011, Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women. Previously, women and men had competed against each other, except for one all-female Memorial Day competition held in 1975. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pepto-Bismol-sponsored pink belt and",
                "score": 0.7802734375
            },
            {
                "id": "3360007",
                "title": "Nathan's Hot Dog Eating Contest",
                "text": "Island most years since about 1972, usually in conjunction with Independence Day. Nathan's promoter Mortimer Matz claimed that on July 4, 1916, four immigrants held a hot dog eating contest at Nathan's Famous stand on Coney Island to settle an argument about who was the most patriotic. He also made the spurious claim that the contest has been held each year since then except 1941 (\"as a protest to the war in Europe\") and 1971 (as a protest to political unrest in the U.S.). A man by the name of Jim Mullen is said to have won the first contest,",
                "score": 0.7802734375
            },
            {
                "id": "3360012",
                "title": "Nathan's Hot Dog Eating Contest",
                "text": "called to the stage individually during introductions. In 2013, six-time defending champion Joey Chestnut was escorted to the stage in a sedan chair. The competition draws many spectators and worldwide press coverage. In 2007, an estimated 50,000 came out to witness the event. In 2004 a three-story-high \"Hot Dog Eating Wall of Fame\" was erected at the site of the annual contest. The wall lists past winners, and has a digital clock which counts down the minutes until the next contest. Despite substantial damage suffered at Nathan's due to Hurricane Sandy in October 2012, the location was repaired, reopened, and",
                "score": 0.7958984375
            },
            {
                "id": "3360002",
                "title": "Nathan's Hot Dog Eating Contest",
                "text": "Nathan's Hot Dog Eating Contest The Nathan's Hot Dog Eating Contest is an annual American hot dog competitive eating competition. It is held each year on Independence Day at Nathan's Famous Corporation's original, and best-known restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The contest has gained public attention in recent years due to the stardom of Takeru Kobayashi and Joey Chestnut. The defending men's champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest. The defending women's champion is Miki Sudo, who ate 37 hot",
                "score": 0.7978515625
            },
            {
                "id": "3425375",
                "title": "Sonya Thomas",
                "text": "at the time. The only eaters besides Kobayashi to defeat Thomas between the 2004 and 2005 Nathan's contests were Bill \"El Wingador\" Simmons in the controversial 2005 Wing Bowl, and Dale Boone, who won a contest eating overheated baked beans after he doused the beans in water to cool them down. On July 4, 2011, Thomas became the first champion of Nathan's Hot Dog Eating Contest for Women. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pink Belt and won $10,000. Thomas had won 1st place in the National Buffalo Wing Festival US chicken wing eating championship",
                "score": 0.8037109375
            }
        ]
    }
]
```
5. Load the LLM and tokenizer:

```python
if args.rag_model == 'InstructRAG-FT':
    demos = []
    if args.load_local_model:  # local
        llm = LLM(model=f'saved_models/InstructRAG-FT/{args.dataset_name}', max_model_len=args.max_tokens)
    else:  # download from the Hub
        llm = LLM(model=f'meng-lab/{args.dataset_name}-InstructRAG-FT', download_dir=args.cache_dir, max_model_len=args.max_tokens)

elif args.rag_model == 'InstructRAG-ICL':
    demos = jload(f'dataset/{args.dataset_name}/demos.json')
    llm = LLM(model='meta-llama/Meta-Llama-3-8B-Instruct', download_dir=args.cache_dir, max_model_len=args.max_tokens)

tokenizer = llm.get_tokenizer()
```

```text
INFO 04-10 11:21:45 llm_engine.py:98] Initializing an LLM engine (v0.4.1) with config: model='saved_models/InstructRAG-FT/ASQA', speculative_config=None, tokenizer='saved_models/InstructRAG-FT/ASQA', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0)
INFO 04-10 11:21:45 utils.py:608] Found nccl from library /public/home/jfqu/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 04-10 11:21:48 selector.py:65] Cannot use FlashAttention backend for Volta and Turing GPUs.
INFO 04-10 11:21:48 selector.py:33] Using XFormers backend.
INFO 04-10 11:22:38 model_runner.py:173] Loading model weights took 14.9595 GB
INFO 04-10 11:22:40 gpu_executor.py:119] # GPU blocks: 5774, # CPU blocks: 2048
INFO 04-10 11:22:42 model_runner.py:976] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 04-10 11:22:42 model_runner.py:980] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 04-10 11:22:51 model_runner.py:1057] Graph capturing finished in 9 secs.
```
6. Load the prompt templates:

```python
prompt_dict = jload(args.prompt_dict_path)
print(json.dumps(prompt_dict, ensure_ascii=False, indent=4))
```

```json
{
    "user_prefix": "<|start_header_id|>user<|end_header_id|>\n\n",
    "assistant_prefix": "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    "query_prompt": "Based on your knowledge and the provided information, answer the question:\n{question}",
    "demo_task_instruction": "Your task is to analyze the provided documents and answer the given question. Please generate a brief explanation of how the contents of these documents lead to your answer. If the provided information is not helpful to answer the question, you only need to respond based on your own knowledge, without referring to the documents.\n\nBelow are some examples of how to answer the question:\n\n",
    "demo_prefix": "Read the following documents relevant to the given question: {question}\n\n",
    "demo_postfix": "###\n\nNow it is your turn to analyze the following documents and answer the given question.\n\n",
    "rationale_generation_instruction": "Please identify documents that are useful to answer the given question: '{question}', and explain how the contents lead to the answer: {answers}.\n\nIf none of the documents is aligned with the answer, in that case, you have to explain the answer only based on your own knowledge, without referring to the provided information.\n\n",
    "rationale_generation_postfix_ASQA": "Note that the question may be ambiguous and have multiple correct answers. Make sure your response includes all correct answers and provides clear reasoning details followed by a concise conclusion.",
    "rationale_generation_postfix_PopQA": "Note that the question mainly asks about the object entity that holds a certain relationship with the given subject entity. There may be multiple correct answers. Make sure your response includes all correct answers and provides clear reasoning details followed by a concise conclusion.",
    "rationale_generation_postfix_TriviaQA": "Note that the question may be compositional and require intermediate analysis to deduce the final answer. Make sure your response is grounded and provides clear reasoning details followed by a concise conclusion.",
    "rationale_generation_postfix_NaturalQuestions": "Note that the question may be compositional and require intermediate analysis to deduce the final answer. Make sure your response is grounded and provides clear reasoning details followed by a concise conclusion.",
    "rationale_generation_postfix_2WikiMultiHopQA": "Note that the question may be compositional and require intermediate analysis to deduce the final answer. Make sure your response is grounded and provides clear reasoning details followed by a concise conclusion."
}
```
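The {question} and {answers} placeholders in these templates are filled via format_map, as format_prompt below does. A tiny illustration:

```python
example = {"question": "who won nathan's hot dog eating contest?"}
print(prompt_dict["query_prompt"].format_map(example))
# Based on your knowledge and the provided information, answer the question:
# who won nathan's hot dog eating contest?
```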
  1. 加载 data_utils.py 中的函数:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Imports these helpers rely on (data_utils.py imports them at the top)
import copy

import torch
import transformers
from tqdm import tqdm

# Normalize the question: ensure it ends with a question mark and starts cleanly
def normalize_question(question):
    if not question.endswith("?"):
        question = question + "?"
    if question.startswith("."):  # check for a leading "."
        question = question.lstrip(". ")  # strip leading "." and spaces

    return question[0].lower() + question[1:]  # lowercase the first letter

# Build the context documents, keeping them in ascending order of score
def build_contexts(example, n_docs):
    if len(example["ctxs"]) > 0 and example["ctxs"][0]["score"] > example["ctxs"][1]["score"]:  # if the first document scores higher, reverse the list
        ctxs_list = example["ctxs"][:n_docs][::-1]
    else:  # otherwise take the first n_docs documents as-is
        ctxs_list = example["ctxs"][:n_docs]

    # Format the document text
    docs_text = "\n\n".join([f"Document {idx+1} (Title: {ctx['title']}): {ctx['text']}" for idx, ctx in enumerate(ctxs_list)])
    doc_prompt = f"{docs_text}\n\n"

    return doc_prompt

# Format the prompt text
def format_prompt(
    dataset_name: str,
    example: dict,
    n_docs: int,
    prompt_dict: dict,
    tokenizer: transformers.PreTrainedTokenizer,
    do_rationale_generation: bool,
    demos: list = [],
) -> str:
    example['question'] = normalize_question(example['question'])  # normalize the question
    max_length = tokenizer.model_max_length  # maximum input length

    query_prompt = prompt_dict['query_prompt'].format_map(example)  # query prompt
    target_prefix = ""  # target prefix

    doc_prompt = build_contexts(example, n_docs=n_docs)  # build the document context
    prefix = prompt_dict['user_prefix']  # user prefix

    if do_rationale_generation:  # rationale generation
        query_prompt = ''
        prefix += prompt_dict['demo_prefix'].format_map(example)  # add the demo prefix
        target_prefix += prompt_dict['rationale_generation_instruction'].format_map(example) + prompt_dict['rationale_generation_postfix_' + dataset_name]  # add the rationale-generation instruction

    elif len(demos) > 0:  # ICL
        prefix += prompt_dict['demo_task_instruction']  # add the demo task instruction

        for idx, demo in enumerate(demos):
            demo_question = normalize_question(demo['question'])  # normalize the demo question
            demo_rationale = demo['rationale']  # demo rationale
            prefix += f"###\n\nExample {idx+1}\n\nQuestion: {demo_question}\n\nAnswer: {demo_rationale}\n\n"  # append the demo example

        prefix += prompt_dict['demo_postfix']  # add the demo postfix

    prefix_tokenized_id = tokenizer(prefix, return_tensors="pt", add_special_tokens=True).input_ids  # tokenize the prefix
    prefix_len = prefix_tokenized_id.shape[-1]  # prefix length in tokens (len(...) would return the batch dimension, i.e. 1)

    target_prefix += prompt_dict['assistant_prefix']  # add the assistant prefix

    input_ids = tokenizer(doc_prompt + query_prompt + target_prefix, return_tensors="pt", add_special_tokens=False).input_ids  # tokenize the input

    if input_ids.shape[-1] > max_length - prefix_len:
        input_ids = input_ids[..., -(max_length - prefix_len):]  # truncate the overlong part
    input_ids = torch.cat([prefix_tokenized_id, input_ids], axis=-1)  # concatenate prefix and input

    formatted_prompt = tokenizer.decode(input_ids[0], skip_special_tokens=False)  # decode back to text
    return formatted_prompt

# Format prompts for a whole list of examples
def format_prompt_with_data_list(
    data_list: list[dict],
    dataset_name: str,
    prompt_dict: dict,
    tokenizer: transformers.PreTrainedTokenizer,
    n_docs: int = 5,
    demos: list = [],
    do_rationale_generation: bool = False,
):
    data = copy.deepcopy(data_list)  # deep-copy all nested objects
    formatted_data = [format_prompt(dataset_name, example, n_docs, prompt_dict, tokenizer, do_rationale_generation, demos) for example in tqdm(data)]  # with a progress bar

    return formatted_data
PYTHON
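As a quick sanity check, here is a usage sketch of the two helpers above on an invented two-document example (all values are hypothetical):

# Hypothetical example to exercise the helpers above
example = {
    "question": ".Who won the contest",
    "ctxs": [
        {"title": "Doc A", "text": "first passage", "score": "0.80"},
        {"title": "Doc B", "text": "second passage", "score": "0.75"},
    ],
}

print(normalize_question(example["question"]))
# -> "who won the contest?"

# The first document scores higher, so the list is reversed into ascending order
print(build_contexts(example, n_docs=2))
# -> "Document 1 (Title: Doc B): second passage\n\nDocument 2 (Title: Doc A): first passage\n\n"
PYTHON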
  1. Build the model inputs:
prompts = format_prompt_with_data_list(
    data_list=test_data,
    dataset_name=args.dataset_name,
    prompt_dict=prompt_dict,
    tokenizer=tokenizer,
    n_docs=args.n_docs,
    demos=demos,
)
print(json.dumps(prompts, ensure_ascii=False, indent=4))
PYTHON

100%|██████████| 1/1 [00:00<00:00, 153.02it/s]
[
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nDocument 1 (Title: Nathan's Hot Dog Eating Contest): competitive eating by downing 50 hot dogs—smashing the previous record of 25.5. The Japanese eater introduced advanced eating and training techniques that shattered previous competitive eating world records. The rise in popularity of the event coincided with the surge in popularity of the worldwide competitive eating circuit. On July 4, 2011, Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women. Previously, women and men had competed against each other, except for one all-female Memorial Day competition held in 1975. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pepto-Bismol-sponsored pink belt and\n\nDocument 2 (Title: Nathan's Hot Dog Eating Contest): Island most years since about 1972, usually in conjunction with Independence Day. Nathan's promoter Mortimer Matz claimed that on July 4, 1916, four immigrants held a hot dog eating contest at Nathan's Famous stand on Coney Island to settle an argument about who was the most patriotic. He also made the spurious claim that the contest has been held each year since then except 1941 (\"as a protest to the war in Europe\") and 1971 (as a protest to political unrest in the U.S.). A man by the name of Jim Mullen is said to have won the first contest,\n\nDocument 3 (Title: Nathan's Hot Dog Eating Contest): called to the stage individually during introductions. In 2013, six-time defending champion Joey Chestnut was escorted to the stage in a sedan chair. The competition draws many spectators and worldwide press coverage. In 2007, an estimated 50,000 came out to witness the event. In 2004 a three-story-high \"Hot Dog Eating Wall of Fame\" was erected at the site of the annual contest. The wall lists past winners, and has a digital clock which counts down the minutes until the next contest. Despite substantial damage suffered at Nathan's due to Hurricane Sandy in October 2012, the location was repaired, reopened, and\n\nDocument 4 (Title: Nathan's Hot Dog Eating Contest): Nathan's Hot Dog Eating Contest The Nathan's Hot Dog Eating Contest is an annual American hot dog competitive eating competition. It is held each year on Independence Day at Nathan's Famous Corporation's original, and best-known restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The contest has gained public attention in recent years due to the stardom of Takeru Kobayashi and Joey Chestnut. The defending men's champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest. The defending women's champion is Miki Sudo, who ate 37 hot\n\nDocument 5 (Title: Sonya Thomas): at the time. The only eaters besides Kobayashi to defeat Thomas between the 2004 and 2005 Nathan's contests were Bill \"El Wingador\" Simmons in the controversial 2005 Wing Bowl, and Dale Boone, who won a contest eating overheated baked beans after he doused the beans in water to cool them down. On July 4, 2011, Thomas became the first champion of Nathan's Hot Dog Eating Contest for Women. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pink Belt and won $10,000. Thomas had won 1st place in the National Buffalo Wing Festival US chicken wing eating championship\n\nBased on your knowledge and the provided information, answer the question:\nwho won nathan's hot dog eating contest?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
]
TEXT
  1. Set up the sampling parameters:
sampling_params = SamplingParams(temperature=args.temperature,
                                 max_tokens=args.max_tokens,
                                 seed=args.seed,
                                 stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")])
sampling_params
PYTHON

SamplingParams(n=1, best_of=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0, top_p=1.0, top_k=-1, min_p=0.0, seed=42, use_beam_search=False, length_penalty=1.0, early_stopping=False, stop=[], stop_token_ids=[128009, 128009], include_stop_str_in_output=False, ignore_eos=False, max_tokens=4096, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None)
TEXT
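Two details worth noting in the repr above: temperature=0 means greedy decoding, and stop_token_ids is [128009, 128009] because this tokenizer's eos_token is itself <|eot_id|>, so the two entries coincide. The llm engine used in the next step is constructed earlier in inference.py; a minimal sketch of what that construction might look like (the model path and settings here are assumptions):

from vllm import LLM

# Hypothetical engine construction; adjust the model path and GPU count to your setup
llm = LLM(model="model/Llama-3-8B-Instruct",
          tensor_parallel_size=1,
          seed=42)
PYTHON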
  1. Run model inference to get the outputs:
outputs = llm.generate(prompts, sampling_params)
outputs[0].outputs[0].text
PYTHON

Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.43s/it]
'The documents provided do not directly answer the question "who won Nathan\'s Hot Dog Eating Contest?" However, they do provide information about the contest, its history, and past winners.\n\nFrom Document 1, we learn that Takeru Kobayashi set a new record by eating 50 hot dogs in 2006, and Sonya Thomas became the champion of the first Nathan\'s Hot Dog Eating Contest for Women in 2011.\n\nFrom Document 4, we learn that the defending men\'s champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest, and the defending women\'s champion is Miki Sudo, who ate 37 hot dogs in the same contest.\n\nFrom Document 5, we learn that Sonya Thomas became the first champion of Nathan\'s Hot Dog Eating Contest for Women in 2011, eating 40 hot dogs in 10 minutes.\n\nBased on this information, we can conclude that:\n\n* Takeru Kobayashi won the Nathan\'s Hot Dog Eating Contest in 2006, eating 50 hot dogs.\n* Sonya Thomas won the Nathan\'s Hot Dog Eating Contest for Women in 2011, eating 40 hot dogs.\n* Joey Chestnut is the defending men\'s champion, having eaten 74 hot dogs in the 2018 contest.\n* Miki Sudo is the defending women\'s champion, having eaten 37 hot dogs in the 2018 contest.\n\nTherefore, the answer to the question "who won Nathan\'s Hot Dog Eating Contest?" is not a single answer, but rather a list of past winners, including Takeru Kobayashi, Sonya Thomas, Joey Chestnut, and Miki Sudo.'
TEXT
  1. Save the results:
# Save the outputs (jdump is assumed to be the JSON-dump helper from the project's common_utils.py)
import os
import json

def save_outputs(outputs, test_data, output_file, n_docs):
    output_data = []
    for i, output in enumerate(outputs):
        prompt = output.prompt  # the model input
        generated_text = output.outputs[0].text  # the model output
        sample = test_data[i]  # the corresponding source example

        # Build the output record
        output_data.append({
            "question": sample["question"],  # question
            "answers": sample["answers"],  # answers
            "qa_pairs": sample["qa_pairs"] if "qa_pairs" in sample else None,  # QA pairs
            "rationale": generated_text,  # generated rationale
            "prompt": prompt,  # model input
            "ctxs": sample["ctxs"][:n_docs][::-1] if (sample["ctxs"][0]['score'] > sample["ctxs"][1]['score']) else
                    sample["ctxs"][:n_docs],  # context documents, in ascending order of score
        })

    # Save the output records as a JSON file
    jdump(output_data, output_file)
    print(f"Outputs saved to {output_file}")

    return output_data

output_file = os.path.join(args.output_dir, "result.json")
eval_results = save_outputs(outputs, test_data, output_file, args.n_docs)
print(json.dumps(eval_results, ensure_ascii=False, indent=4))
PYTHON

Outputs saved to ./result.json
[
{
"question": "Who won nathan's hot dog eating contest?",
"answers": [
"The Nathan's Hot Dog Eating Contest is an annual American hot dog eating competition held on Independence Day at Nathan's Famous Corporation's original restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The current men's and women's competitions champions are Joey Chestnut and Michelle Lesco, who crowned themselves in the 2021 edition. Previously, Miki Sudo had won the women's competition every year from 2014-2020, with Chestnut doing so in the men's variant in 2017 and 2016 and Matt Stonie in 2015."
],
"qa_pairs": [
{
"question": "Who won the men's competition of Nathan's Hot Dog Eating Contest in 2017?",
"answers": [
"Joey Chestnut"
]
},
{
"question": "Who won the women's competition of Nathan's Hot Dog Eating Contest in 2017?",
"answers": [
"Miki Sudo"
]
},
{
"question": "Who won the men's competition of Nathan's Hot Dog Eating Contest in 2016?",
"answers": [
"Joey Chestnut"
]
},
{
"question": "Who won the women's competition of Nathan's Hot Dog Eating Contest in 2016?",
"answers": [
"Miki Sudo"
]
},
{
"question": "Who won the men's competition of Nathan's Hot Dog Eating Contest in 2015?",
"answers": [
"Matt Stonie"
]
},
{
"question": "Who won the women's competition of Nathan's Hot Dog Eating Contest in 2015?",
"answers": [
"Miki Sudo"
]
}
],
"rationale": "The documents provided do not directly answer the question \"who won Nathan's Hot Dog Eating Contest?\" However, they do provide information about the contest, its history, and past winners.\n\nFrom Document 1, we learn that Takeru Kobayashi set a new record by eating 50 hot dogs in 2006, and Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women in 2011.\n\nFrom Document 4, we learn that the defending men's champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest, and the defending women's champion is Miki Sudo, who ate 37 hot dogs in the same contest.\n\nFrom Document 5, we learn that Sonya Thomas became the first champion of Nathan's Hot Dog Eating Contest for Women in 2011, eating 40 hot dogs in 10 minutes.\n\nBased on this information, we can conclude that:\n\n* Takeru Kobayashi won the Nathan's Hot Dog Eating Contest in 2006, eating 50 hot dogs.\n* Sonya Thomas won the Nathan's Hot Dog Eating Contest for Women in 2011, eating 40 hot dogs.\n* Joey Chestnut is the defending men's champion, having eaten 74 hot dogs in the 2018 contest.\n* Miki Sudo is the defending women's champion, having eaten 37 hot dogs in the 2018 contest.\n\nTherefore, the answer to the question \"who won Nathan's Hot Dog Eating Contest?\" is not a single answer, but rather a list of past winners, including Takeru Kobayashi, Sonya Thomas, Joey Chestnut, and Miki Sudo.",
"prompt": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nDocument 1 (Title: Nathan's Hot Dog Eating Contest): competitive eating by downing 50 hot dogs—smashing the previous record of 25.5. The Japanese eater introduced advanced eating and training techniques that shattered previous competitive eating world records. The rise in popularity of the event coincided with the surge in popularity of the worldwide competitive eating circuit. On July 4, 2011, Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women. Previously, women and men had competed against each other, except for one all-female Memorial Day competition held in 1975. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pepto-Bismol-sponsored pink belt and\n\nDocument 2 (Title: Nathan's Hot Dog Eating Contest): Island most years since about 1972, usually in conjunction with Independence Day. Nathan's promoter Mortimer Matz claimed that on July 4, 1916, four immigrants held a hot dog eating contest at Nathan's Famous stand on Coney Island to settle an argument about who was the most patriotic. He also made the spurious claim that the contest has been held each year since then except 1941 (\"as a protest to the war in Europe\") and 1971 (as a protest to political unrest in the U.S.). A man by the name of Jim Mullen is said to have won the first contest,\n\nDocument 3 (Title: Nathan's Hot Dog Eating Contest): called to the stage individually during introductions. In 2013, six-time defending champion Joey Chestnut was escorted to the stage in a sedan chair. The competition draws many spectators and worldwide press coverage. In 2007, an estimated 50,000 came out to witness the event. In 2004 a three-story-high \"Hot Dog Eating Wall of Fame\" was erected at the site of the annual contest. The wall lists past winners, and has a digital clock which counts down the minutes until the next contest. Despite substantial damage suffered at Nathan's due to Hurricane Sandy in October 2012, the location was repaired, reopened, and\n\nDocument 4 (Title: Nathan's Hot Dog Eating Contest): Nathan's Hot Dog Eating Contest The Nathan's Hot Dog Eating Contest is an annual American hot dog competitive eating competition. It is held each year on Independence Day at Nathan's Famous Corporation's original, and best-known restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The contest has gained public attention in recent years due to the stardom of Takeru Kobayashi and Joey Chestnut. The defending men's champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest. The defending women's champion is Miki Sudo, who ate 37 hot\n\nDocument 5 (Title: Sonya Thomas): at the time. The only eaters besides Kobayashi to defeat Thomas between the 2004 and 2005 Nathan's contests were Bill \"El Wingador\" Simmons in the controversial 2005 Wing Bowl, and Dale Boone, who won a contest eating overheated baked beans after he doused the beans in water to cool them down. On July 4, 2011, Thomas became the first champion of Nathan's Hot Dog Eating Contest for Women. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pink Belt and won $10,000. Thomas had won 1st place in the National Buffalo Wing Festival US chicken wing eating championship\n\nBased on your knowledge and the provided information, answer the question:\nwho won nathan's hot dog eating contest?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
"ctxs": [
{
"id": "3360010",
"title": "Nathan's Hot Dog Eating Contest",
"text": "competitive eating by downing 50 hot dogs—smashing the previous record of 25.5. The Japanese eater introduced advanced eating and training techniques that shattered previous competitive eating world records. The rise in popularity of the event coincided with the surge in popularity of the worldwide competitive eating circuit. On July 4, 2011, Sonya Thomas became the champion of the first Nathan's Hot Dog Eating Contest for Women. Previously, women and men had competed against each other, except for one all-female Memorial Day competition held in 1975. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pepto-Bismol-sponsored pink belt and",
"score": 0.7802734375
},
{
"id": "3360007",
"title": "Nathan's Hot Dog Eating Contest",
"text": "Island most years since about 1972, usually in conjunction with Independence Day. Nathan's promoter Mortimer Matz claimed that on July 4, 1916, four immigrants held a hot dog eating contest at Nathan's Famous stand on Coney Island to settle an argument about who was the most patriotic. He also made the spurious claim that the contest has been held each year since then except 1941 (\"as a protest to the war in Europe\") and 1971 (as a protest to political unrest in the U.S.). A man by the name of Jim Mullen is said to have won the first contest,",
"score": 0.7802734375
},
{
"id": "3360012",
"title": "Nathan's Hot Dog Eating Contest",
"text": "called to the stage individually during introductions. In 2013, six-time defending champion Joey Chestnut was escorted to the stage in a sedan chair. The competition draws many spectators and worldwide press coverage. In 2007, an estimated 50,000 came out to witness the event. In 2004 a three-story-high \"Hot Dog Eating Wall of Fame\" was erected at the site of the annual contest. The wall lists past winners, and has a digital clock which counts down the minutes until the next contest. Despite substantial damage suffered at Nathan's due to Hurricane Sandy in October 2012, the location was repaired, reopened, and",
"score": 0.7958984375
},
{
"id": "3360002",
"title": "Nathan's Hot Dog Eating Contest",
"text": "Nathan's Hot Dog Eating Contest The Nathan's Hot Dog Eating Contest is an annual American hot dog competitive eating competition. It is held each year on Independence Day at Nathan's Famous Corporation's original, and best-known restaurant at the corner of Surf and Stillwell Avenues in Coney Island, a neighborhood of Brooklyn, New York City. The contest has gained public attention in recent years due to the stardom of Takeru Kobayashi and Joey Chestnut. The defending men's champion is Joey Chestnut, who ate 74 hot dogs in the 2018 contest. The defending women's champion is Miki Sudo, who ate 37 hot",
"score": 0.7978515625
},
{
"id": "3425375",
"title": "Sonya Thomas",
"text": "at the time. The only eaters besides Kobayashi to defeat Thomas between the 2004 and 2005 Nathan's contests were Bill \"El Wingador\" Simmons in the controversial 2005 Wing Bowl, and Dale Boone, who won a contest eating overheated baked beans after he doused the beans in water to cool them down. On July 4, 2011, Thomas became the first champion of Nathan's Hot Dog Eating Contest for Women. Eating 40 hot dogs in 10 minutes, Thomas earned the inaugural Pink Belt and won $10,000. Thomas had won 1st place in the National Buffalo Wing Festival US chicken wing eating championship",
"score": 0.8037109375
}
]
}
]
JSON
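One note on the save step above: jdump is not defined in this excerpt; it presumably comes from the project's common_utils.py. A plausible stand-in, purely as an assumption about what it does (not the repo's actual code):

import json
import os

# Hypothetical minimal equivalent of common_utils.jdump
def jdump(obj, path, indent=4):
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)  # ensure the directory exists
    with open(path, "w") as f:
        json.dump(obj, f, indent=indent, ensure_ascii=False)
PYTHON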
  1. Load the functions from metrics.py:
# Imports these helpers rely on
import re
import string
import json

import numpy as np
from tqdm import tqdm

# Normalize the answer text
def normalize_answer(s):
    def remove_articles(text):  # remove articles (a, an, the)
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):  # collapse extra whitespace so words are separated by single spaces
        return " ".join(text.split())

    def remove_punc(text):  # remove all punctuation
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):  # lowercase the text
        return text.lower()

    # Apply in order: lowercase -> remove punctuation -> remove articles -> fix whitespace
    return white_space_fix(remove_articles(remove_punc(lower(s))))

# Check whether any answer in the list appears in the given context
def exact_presence(answers, context):
    # Normalize all answers and the context
    answers = [normalize_answer(ans) for ans in answers]
    context = normalize_answer(context)

    # Check each answer for presence in the context
    for ans in answers:
        if ans in context:
            return True

    return False  # no answer appeared

# Compute the STR-EM metric (ASQA only)
def compute_str_em(data):
    # Check that the data contains a qa_pairs field
    if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None:
        return 0, 0

    acc = []  # per-item accuracy over its QA pairs
    hit = []  # whether an item is a full hit (all QA pairs matched)

    for item in data:
        loc_acc = []  # per-QA-pair results for the current item
        for qa_pair in item['qa_pairs']:
            # Check whether the answers of this QA pair appear in the rationale
            loc_acc.append(exact_presence(qa_pair['answers'], item["rationale"]))

        # Accuracy and full-hit flag for the current item
        acc.append(np.mean(loc_acc))
        hit.append(int(np.mean(loc_acc) == 1))

    # Return mean accuracy and full-hit rate (as percentages)
    return 100 * np.mean(acc), 100 * np.mean(hit)

# Compute and save the evaluation metrics
def get_metrics(data, save_dir=None, is_asqa=False):
    idx = 0
    num_accurate = 0
    print('Evaluating results...')
    if is_asqa:
        # For ASQA, compute the STR-EM metric
        rationale_str_em, _ = compute_str_em(data)
    else:
        # Otherwise check, for each example, whether any answer appears in the rationale
        for d in tqdm(data):
            idx += 1
            is_accurate = exact_presence(d['answers'], d['rationale'])
            num_accurate += 1 if is_accurate else 0

    if is_asqa:
        # Print and save the ASQA evaluation result
        # (idx is only incremented in the non-ASQA branch, so num_examples is 0 here; see the output below)
        print(f"Rationale EM: {rationale_str_em:.1f}%")
        eval_result = {"EM": rationale_str_em, "num_examples": idx}
    else:
        # Print and save the evaluation result for the other datasets
        accuracy = num_accurate / idx * 100
        print(f"Accuracy: {accuracy:.1f}%")
        eval_result = {"accuracy": accuracy, "num_examples": idx}

    # Save the evaluation result as a JSON file
    with open(f"{save_dir}/metrics.json", "w") as f:
        f.write(json.dumps(eval_result) + "\n")

    return eval_result  # return the evaluation result
PYTHON
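A quick check of the matching logic on invented strings:

print(normalize_answer("The Nathan's Hot Dog Eating Contest!"))
# -> "nathans hot dog eating contest"

# Any candidate answer appearing in the context counts as a hit
print(exact_presence(["Joey Chestnut", "Matt Stonie"],
                     "the defending champion is Joey Chestnut"))
# -> True
PYTHON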
  1. Evaluate the results:
get_metrics(eval_results, args.output_dir, is_asqa=args.dataset_name == 'ASQA')
PYTHON

Evaluating results...
Rationale EM: 83.3%
{'EM': 83.33333333333334, 'num_examples': 0}
TEXT
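Sanity check on the numbers above: the single test example has six qa_pairs, and the generated rationale names Joey Chestnut and Miki Sudo but never Matt Stonie (the 2015 men's winner), so exact_presence matches 5 of the 6 pairs and STR-EM = 5/6 ≈ 83.3%. num_examples is 0 because idx is only incremented in the non-ASQA branch of get_metrics, as noted in the code comment above.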
