【数据准备】常用问答数据集

下面是一些 RAG 领域常用的数据集，并对部分进行了详细介绍。

MuSiQue

项目地址：stonybrooknlp/musique
下载地址：google drive

ans_ 开头的文件（MuSiQue-Ans）中，每个问题都可以由给定的上下文回答；full_ 开头的文件（MuSiQue-Full）在此基础上还包含上下文不足以回答的问题（"answerable" 为 false），因此样本数是 ans_ 的两倍。

ans_train.jsonl: 19938, ans_dev.jsonl: 2417, full_train.jsonl: 39876, full_dev.jsonl: 4834 结构如下：

{
    "id": str, # 'hop' 前的数字表示分解的子问题数
    "paragraphs": [
        {
            "idx": int,
            "title": str,
            "paragraph_text": str,
            "is_supporting": bool # 该段落是否支持回答当前的问题
        }
        ...
    ],
    "question": str,
    "question_decomposition": [
        {
            "id": int,
            "question": str,
            "answer": str,
            "paragraph_support_idx": int # 支持分解问题答案的段落索引
        },
        ...
    ],
    "answer": str,
    "answer_aliases": str list, # 答案的别名或同义词列表
    "answerable": bool # 当前问题是否可以被回答
}

ans_test.jsonl: 2459, full_test.jsonl: 4918 结构如下：

{
    "id": str, # 'hop' 前的数字表示分解的子问题数
    "paragraphs": [
        {
            "idx": int,
            "title": str,
            "paragraph_text": str
        }
        ...
    ],
    "question": str
}

HotpotQA

项目地址：hotpotqa/hotpot
下载地址：hotpotqa

Windows PowerShell 可以使用以下命令下载：Invoke-WebRequest -Uri <json 文件链接> -OutFile <文件保存路径>

train.json: 90447, dev_distractor.json（包含黄金上下文的）: 7405, dev_fullwiki.json（包含检索上下文的）: 7405 结构如下：

{
    "supporting_facts": [ # 支持答案的文档标题和句子
        [ str, int ] # [标题, 句子序号]
        ...
    ],
    "level": str, # 问题难度: easy、medium、hard
    "question": str,
    "context": [ # 上下文文档
        [
            str, # 标题
            [ # 句子列表
                str,    
                ...
            ]
        ],
        ...
    ],
    "answer": str,
    "_id": str,
    "type": str # 问答类型: bridge（跨越多个文档）、comparison（对比多个事实）
}

test.json: 7405 结构如下：

{
    "_id": str,
    "question": str,
    "context": [ # 上下文文档
        [
            str, # 标题
            [ # 句子列表
                str,    
                ...
            ]
        ],
        ...
    ]
}

2WikiMultiHopQA

项目地址：Alab-NII/2wikimultihop
下载地址：dropbox

train.json: 167454, dev.json: 12576, test.json（"supporting_facts", "evidences", "answer" 为空）: 12576 结构如下：

{
    "_id": str,
    "type": str, # 问答类型: compositional、inference、bridge_comparison、comparison
    "question": str,
    "context": [ # 上下文文档
        [
            str, # 标题
            [ # 句子列表
                str,
                ...
            ]
        ],
        ...
    ],
    "supporting_facts": [ # 支持答案的文档标题和句子
        [ str, int ] # [标题, 句子序号]
        ...
    ],
    "evidences": [ # 三元组列表，对应 supporting_facts 的句子
        [ str, str, str ],
        ...
    ],
    "answer": str
}

PopQA

项目地址：AlexTMallen/adaptive-retrieval
下载地址：akariasai/PopQA

源文件是 TSV 格式，可以转换为 JSON 格式：

import csv
import json


# 将 TSV 文件转换为 JSON 文件
def tsv_to_json(tsv_file_path, json_file_path):
    """Convert a tab-separated file with a header row into a JSON array file.

    Every data row becomes one JSON object keyed by the header names. The
    columns ``s_aliases``, ``o_aliases`` and ``possible_answers`` are parsed
    from their JSON-list text form into real lists, and the columns ``id``,
    ``subj_id``, ``prop_id``, ``obj_id``, ``s_pop`` and ``o_pop`` are
    converted to integers. A value that cannot be converted is reported on
    stdout and kept unchanged, so one malformed row never aborts the run.

    Args:
        tsv_file_path: Path of the UTF-8 encoded TSV file to read.
        json_file_path: Path of the JSON file to write (overwritten).
    """
    list_fields = ("s_aliases", "o_aliases", "possible_answers")
    int_fields = ("id", "subj_id", "prop_id", "obj_id", "s_pop", "o_pop")

    # newline='' is what the csv module asks for so that it can handle line
    # endings (including newlines embedded in quoted fields) by itself.
    with open(tsv_file_path, 'r', encoding='utf-8', newline='') as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter='\t')

        data = []
        for row in reader:
            for key in list_fields:
                value = row.get(key)
                # DictReader pads rows that are shorter than the header with
                # None, so only attempt to parse genuine strings.
                if (
                    isinstance(value, str)
                    and value.startswith("[")
                    and value.endswith("]")
                ):
                    try:
                        row[key] = json.loads(value)
                    except json.JSONDecodeError:
                        print(f"无法解析字段 {key} 的值：{row[key]}")

            for key in int_fields:
                if key in row:
                    try:
                        row[key] = int(row[key])
                    except (TypeError, ValueError):
                        # TypeError covers the None padding of short rows;
                        # ValueError covers non-numeric text.
                        print(f"无法将字段 {key} 的值 {row[key]} 转换为整数")

            data.append(row)

    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

    print(f"TSV 文件已成功转换为 JSON 文件：{json_file_path}")


if __name__ == "__main__":
    # Script entry point: convert the PopQA test split from TSV to JSON.
    source_tsv = "dataset/PopQA/test.tsv"
    target_json = "dataset/PopQA/test.json"
    tsv_to_json(source_tsv, target_json)

test.json: 14267 结构如下：

{
    "id": int,
    "subj": str, # 主体实体
    "prop": str, # 关系类型
    "obj": str,  # 客体实体
    "subj_id": int,
    "prop_id": int,
    "obj_id": int,
    "s_aliases": [ str, ... ], # 主体别名列表
    "o_aliases": [ str, ... ], # 客体别名列表
    "s_uri": str, # 主体维基百科网址
    "o_uri": str, # 客体维基百科网址
    "s_wiki_title": str, # 主体维基百科标题
    "o_wiki_title": str, # 客体维基百科标题
    "s_pop": int, # 主体维基百科浏览量/月
    "o_pop": int, # 客体维基百科浏览量/月
    "question": str,
    "possible_answers": [ str, ... ] # 答案列表
}

ASQA

项目地址：google-research/language/asqa
下载地址：din0s/asqa

源文件是 Parquet 格式，可以转换为 JSON 格式：

import pandas as pd


# 将 Parquet 文件转换为 JSON 文件
def parquet_to_json(parquet_file_path, json_file_path):
    """Convert a Parquet file into a JSON file containing an array of records.

    The table is loaded with pandas and written with ``orient='records'``
    and ``force_ascii=False``. Any exception raised along the way is caught
    and reported on stdout instead of propagating, so the function always
    returns ``None``.

    Args:
        parquet_file_path: Path of the Parquet file to read.
        json_file_path: Path of the JSON file to write.
    """
    try:
        # Read the Parquet table and serialise it straight to the target file.
        pd.read_parquet(parquet_file_path).to_json(
            json_file_path,
            orient='records',
            force_ascii=False,
        )
        print(f"Parquet 文件已成功转换为 JSON 文件：{json_file_path}")
    except Exception as error:
        # Best-effort conversion: report the failure and carry on.
        print(f"转换过程中发生错误：{error}")


if __name__ == "__main__":
    # Script entry point: convert the ASQA train and dev splits in order.
    conversions = (
        ("dataset/ASQA/train.parquet", "dataset/ASQA/train.json"),
        ("dataset/ASQA/dev.parquet", "dataset/ASQA/dev.json"),
    )
    for source_parquet, target_json in conversions:
        parquet_to_json(source_parquet, target_json)

train.json: 4353, dev.json: 948 结构如下：

{
    "ambiguous_question": str,
    "qa_pairs": [ # 消除歧义的问答对列表
        {
            "context": str, # 上下文
            "question": str, # 消除歧义问题
            "short_answers": [ # 简短答案列表
                str,
                ...
            ],
            "wikipage": str # 上下文的维基百科标题
        },
        ...
    ],
    "wikipages": [ # 维基百科页面列表
        {
            "title": str, # 标题
            "url": str # 网址
        }
        ...
    ],
    "annotations": [ # 答案列表
        {
            "knowledge": [ # 上下文列表
                {
                    "content": str, # 上下文
                    "wikipage": str # 上下文的维基百科标题
                }
                ...
            ],
            "long_answer": str
        }
        ...
    ],
    "sample_id": str
}