349 changes: 349 additions & 0 deletions biodatasets/mediqa_ans/mediqa_ans.py
@@ -0,0 +1,349 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
Questions. This is the first summarization collection containing question-driven summaries of answers to consumer
health questions. The dataset can be used to evaluate single- or multi-document summaries generated by algorithms
using extractive or abstractive approaches.

This dataset contains 8 different subsets, each of which supports both the source and bigbio schemas. The subsets
arise from the permutation of three settings:
- [page2answer, section2answer]: Full web page as context / manually selected passages as context
- [multi, single]: Summarize the answer to a question across all documents /
  summarize the answer separately for each document (each document is its own sample)
- [abstractive, extractive]: Abstractive summary / extractive summary


List of subset_ids:
- mediqa_ans_page2answer_multi_abstractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the multi-document abstractive summary.
- mediqa_ans_page2answer_multi_extractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the multi-document extractive summary.
- mediqa_ans_page2answer_single_abstractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the single-document abstractive summary for each answer.
- mediqa_ans_page2answer_single_extractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the single-document extractive summary for each answer.
- mediqa_ans_section2answer_multi_abstractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the multi-document abstractive summary for each answer.
- mediqa_ans_section2answer_multi_extractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the multi-document extractive summary for each answer.
- mediqa_ans_section2answer_single_abstractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the single-document abstractive summary for each answer.
- mediqa_ans_section2answer_single_extractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the single-document extractive summary for each answer.

In addition, there is the subset mediqa_ans_all, for which only a source schema exists; it contains all questions,
pages, passages, ratings, URLs, and every type of summary.
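
A minimal usage sketch (assuming the script is loaded by its local path from a checkout of this repository; adjust
the path to your layout):

    from datasets import load_dataset

    ds = load_dataset(
        "biodatasets/mediqa_ans/mediqa_ans.py",
        name="mediqa_ans_section2answer_single_abstractive_bigbio_t2t",
    )
    print(ds["train"][0]["text_2"])  # the reference summary for the first example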
"""

import itertools as it
import json
from typing import Dict, List, Tuple

import datasets

from utils import schemas
from utils.configs import BigBioConfig
from utils.constants import Tasks

_CITATION = """\
@article{savery2020question,
    author={Savery, Max
    and Abacha, Asma Ben
    and Gayen, Soumya
    and Demner-Fushman, Dina},
    title={Question-driven summarization of answers to consumer health questions},
    journal={Scientific Data},
    year={2020},
    month={Oct},
    day={02},
    volume={7},
    number={1},
    pages={322},
    issn={2052-4463},
    doi={10.1038/s41597-020-00667-z},
    url={https://doi.org/10.1038/s41597-020-00667-z}
}
"""

_DATASETNAME = "mediqa_ans"

_DESCRIPTION = """\
Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
Questions. This is the first summarization collection containing question-driven summaries of answers to consumer
health questions. The dataset can be used to evaluate single- or multi-document summaries generated by algorithms
using extractive or abstractive approaches.

This dataset contains 8 different subsets, each of which supports both the source and bigbio schemas. The subsets
arise from the permutation of three settings:
- [page2answer, section2answer]: Full web page as context / manually selected passages as context
- [multi, single]: Summarize the answer to a question across all documents /
  summarize the answer separately for each document (each document is its own sample)
- [abstractive, extractive]: Abstractive summary / extractive summary


List of subset_ids:
- mediqa_ans_page2answer_multi_abstractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the multi-document abstractive summary.
- mediqa_ans_page2answer_multi_extractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the multi-document extractive summary.
- mediqa_ans_page2answer_single_abstractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the single-document abstractive summary for each answer.
- mediqa_ans_page2answer_single_extractive
    Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
    for each answer, and the single-document extractive summary for each answer.
- mediqa_ans_section2answer_multi_abstractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the multi-document abstractive summary for each answer.
- mediqa_ans_section2answer_multi_extractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the multi-document extractive summary for each answer.
- mediqa_ans_section2answer_single_abstractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the single-document abstractive summary for each answer.
- mediqa_ans_section2answer_single_extractive
    Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
    for each answer, and the single-document extractive summary for each answer.

In addition, there is the subset mediqa_ans_all, for which only a source schema exists; it contains all questions,
pages, passages, ratings, URLs, and every type of summary.
"""

_HOMEPAGE = "https://osf.io/fyg46/"

_LICENSE = "CC0"

_URLS = {
_DATASETNAME: "https://osf.io/fs57e/download",
}

_SUPPORTED_TASKS = [Tasks.SUMMARIZATION]

_SOURCE_VERSION = "1.0.0"

_BIGBIO_VERSION = "1.0.0"


class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
    """
    A dataset of manually generated, question-driven summaries of multi- and
    single-document answers to consumer health questions.
    """

    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)

    BUILDER_CONFIGS = []

    BUILDER_CONFIGS.append(
        BigBioConfig(
            name="mediqa_ans_all_source",
            version=SOURCE_VERSION,
            description="MEDIQA-AnS All source schema",
            schema="source",
            subset_id="mediqa_ans_all",
        ),
    )

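    # Build the remaining configs programmatically: a bigbio_t2t and a source config
    # for each of the eight subset_ids listed in the module docstring (16 in total).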
    for setting1 in ["page2answer", "section2answer"]:
        for setting2 in ["multi", "single"]:
            for setting3 in ["abstractive", "extractive"]:
                BUILDER_CONFIGS.append(
                    BigBioConfig(
                        name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t",
                        version=BIGBIO_VERSION,
                        description=f"MEDIQA-AnS {setting1} {setting2} {setting3} BigBio schema",
                        schema="bigbio_t2t",
                        subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
                    )
                )
                BUILDER_CONFIGS.append(
                    BigBioConfig(
                        name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source",
                        version=SOURCE_VERSION,
                        description=f"MEDIQA-AnS {setting1} {setting2} {setting3} source schema",
                        schema="source",
                        subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
                    ),
                )

    DEFAULT_CONFIG_NAME = "mediqa_ans_page2answer_multi_abstractive_source"

    def _info(self) -> datasets.DatasetInfo:

        if self.config.schema == "source" and self.config.subset_id == "mediqa_ans_all":
            features = datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    "multi_abs_summ": datasets.Value("string"),
                    "multi_ext_summ": datasets.Value("string"),
                    "answers": [
                        {
                            "id": datasets.Value("string"),
                            "answer_abs_summ": datasets.Value("string"),
                            "answer_ext_summ": datasets.Value("string"),
                            "section": datasets.Value("string"),
                            "article": datasets.Value("string"),
                            "url": datasets.Value("string"),
                            "rating": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.schema == "source":
            features = datasets.Features(
                {
                    "question": datasets.Value("string"),
                    "question_id": datasets.Value("string"),
                    "summary": datasets.Value("string"),
                    "articles": [
                        {
                            "answer_id": datasets.Value("string"),
                            "text": datasets.Value("string"),
                            "rating": datasets.Value("string"),
                        }
                    ],
                }
            )
        elif self.config.schema == "bigbio_t2t":
            features = schemas.text2text_features

        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
        """Returns SplitGenerators."""

        urls = _URLS[_DATASETNAME]
        file_path = dl_manager.download_and_extract(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": file_path,
                },
            ),
        ]

    def _generate_examples(self, filepath) -> Tuple[int, Dict]:
        """Yields examples as (key, example) tuples."""

        with open(filepath, "r", encoding="utf8") as infile:
            dataset = json.load(infile)

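        # The "all" subset reuses the original question ids as example keys; the
        # per-setting subsets can emit several examples per question, so a running
        # counter supplies unique keys instead.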
        uid = it.count(0)
        if self.config.name == "mediqa_ans_all_source":
            dataset = self._json_dict_to_list(dataset, "id")
            for example in dataset:
                example["answers"] = self._json_dict_to_list(example["answers"], "id")
                yield example["id"], example
        else:
            _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_", 3)
            if self.config.schema == "source":
                for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
                    yield next(uid), example
            elif self.config.schema == "bigbio_t2t":
                for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
                    example = self._source_to_t2t(example)
                    example["id"] = str(next(uid))
                    yield example["id"], example

    def _generate_setting_examples(self, dataset, setting1, setting2, setting3):
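        """
        Yields source-schema examples for one subset: the document text comes from
        the full web page (page2answer) or the manually selected passage
        (section2answer); "single" yields one example per answer document, while
        "multi" yields one example per question with all documents attached; the
        summary is the abstractive or extractive reference, as requested.
        """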
        for question_id, question in dataset.items():
            example = {}
            example["question_id"] = question_id
            example["question"] = question["question"]
            if setting2 == "single":
                for answer_id, answer in question["answers"].items():
                    example_ = example.copy()
                    if setting1 == "section2answer":
                        example_["articles"] = [
                            {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}
                        ]
                    elif setting1 == "page2answer":
                        example_["articles"] = [
                            {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}
                        ]
                    if setting3 == "abstractive":
                        example_["summary"] = answer["answer_abs_summ"]
                    elif setting3 == "extractive":
                        example_["summary"] = answer["answer_ext_summ"]
                    yield example_
            elif setting2 == "multi":
                example["articles"] = []
                for answer_id, answer in question["answers"].items():
                    if setting1 == "section2answer":
                        example["articles"].append(
                            {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}
                        )
                    elif setting1 == "page2answer":
                        example["articles"].append(
                            {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}
                        )

                if setting3 == "abstractive":
                    example["summary"] = question["multi_abs_summ"]
                elif setting3 == "extractive":
                    example["summary"] = question["multi_ext_summ"]
                yield example

    def _source_to_t2t(self, example):
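        """
        Flattens a source-schema example into the bigbio_t2t schema: text_1
        concatenates the question with every answer document (including ids and
        ratings), and text_2 holds the reference summary.
        """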
        example_ = {}
        example_["document_id"] = ""
        example_["text_1_name"] = ""
        example_["text_2_name"] = ""

        text1 = ""
        text1 += "Question ID: " + example["question_id"] + "\n"
        text1 += "Question: " + example["question"] + "\n"
        for article in example["articles"]:
            text1 += "Answer ID: " + article["answer_id"] + "\n"
            text1 += "Answer: " + article["text"] + "\n"
            text1 += "Rating: " + article["rating"] + "\n"
        example_["text_1"] = text1

        example_["text_2"] = example["summary"]

        return example_

    def _json_dict_to_list(self, json_dict, new_key):
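        """
        Converts a dict of dicts into a list of dicts, storing each outer key under
        new_key, e.g. {"Q1": {"question": "..."}} with new_key="id" becomes
        [{"question": "...", "id": "Q1"}].
        """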
        list_ = []
        for key, values in json_dict.items():
            assert isinstance(values, dict), "Child element is not a dict"
            assert new_key not in values, "New key already in values"
            values[new_key] = key
            list_.append(values)
        return list_