diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py new file mode 100644 index 00000000..55d627f9 --- /dev/null +++ b/biodatasets/mediqa_ans/mediqa_ans.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health +Questions. The first summarization collection containing question-driven summaries of answers to consumer health +questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using +extractive or abstractive approaches. + +This dataset contains 8 different subsets which support both source and bigbio schema. These subsets arise from +the permutation of three different settings: +- [page2answer, section2answer]: Full page as context / manually selected passages as context +- [multi, single]: Generate summary of answer for specific question across multiple documents / + Generate summary of answer for specific question for each document (each document is sample) +- [abstractive, extractive]: Abstractive summary / extractive summary + + +List of subset_ids: +- mediqa_ans_page2answer_multi_abstractive + This split contains the question ID and question, the answer ID and the full text of the web pages, the + corresponding rating for each answer, and the multi-document abstractive summary. +- mediqa_ans_page2answer_multi_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the multi-document extractive summary. +- mediqa_ans_page2answer_single_abstractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_page2answer_single_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document extractive summary for each answer. +- mediqa_ans_section2answer_multi_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document abstractive summary for each answer. +- mediqa_ans_section2answer_multi_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document extractive summary for each answer. +- mediqa_ans_section2answer_single_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_section2answer_single_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document extractive summary for each answer. + +Furthermore there exists the subset mediqa_ans_all for which there only exists a source schema and contains +all questions, pages, passages, ratings, urls, and each type of summaries. +""" + +import itertools as it +import json +import os +from typing import Dict, List, Tuple + +import datasets + +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +_CITATION = """\ +@article{, + author={Savery, Max + and Abacha, Asma Ben + and Gayen, Soumya + and Demner-Fushman, Dina}, + title={Question-driven summarization of answers to consumer health questions}, + journal={Scientific Data}, + year={2020}, + month={Oct}, + day={02}, + volume={7}, + number={1}, + pages={322}, + issn={2052-4463}, + doi={10.1038/s41597-020-00667-z}, + url={https://doi.org/10.1038/s41597-020-00667-z} +} +""" + +_DATASETNAME = "mediqa_ans" + +_DESCRIPTION = """\ +Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health +Questions. The first summarization collection containing question-driven summaries of answers to consumer health +questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using +extractive or abstractive approaches. + +This dataset contains 8 different subsets which support both source and bigbio schema. These subsets arise from +the permutation of three different settings: +- [page2answer, section2answer]: Full page as context / manually selected passages as context +- [multi, single]: Generate summary of answer for specific question across multiple documents / + Generate summary of answer for specific question for each document (each document is sample) +- [abstractive, extractive]: Abstractive summary / extractive summary + + +List of subset_ids: +- mediqa_ans_page2answer_multi_abstractive + This split contains the question ID and question, the answer ID and the full text of the web pages, the + corresponding rating for each answer, and the multi-document abstractive summary. +- mediqa_ans_page2answer_multi_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the multi-document extractive summary. +- mediqa_ans_page2answer_single_abstractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_page2answer_single_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document extractive summary for each answer. +- mediqa_ans_section2answer_multi_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document abstractive summary for each answer. +- mediqa_ans_section2answer_multi_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document extractive summary for each answer. +- mediqa_ans_section2answer_single_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_section2answer_single_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document extractive summary for each answer. + +Furthermore there exists the subset mediqa_ans_all for which there only exists a source schema and contains +all questions, pages, passages, ratings, urls, and each type of summaries. +""" + +_HOMEPAGE = "https://osf.io/fyg46/" + +_LICENSE = "CC0" + +_URLS = { + _DATASETNAME: "https://osf.io/fs57e/download", +} + +_SUPPORTED_TASKS = [Tasks.SUMMARIZATION] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + + +class MediqaAnsDataset(datasets.GeneratorBasedBuilder): + """ + A dataset of manually generated, question-driven summaries of multi and + single document answers to consumer health questions. + """ + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [] + + BUILDER_CONFIGS.append( + BigBioConfig( + name="mediqa_ans_all_source", + version=BIGBIO_VERSION, + description="MEDIQA-AnS All source schema", + schema="source", + subset_id="mediqa_ans_all", + ), + ) + + for setting1 in ["page2answer", "section2answer"]: + for setting2 in ["multi", "single"]: + for setting3 in ["abstractive", "extractive"]: + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t", + version=BIGBIO_VERSION, + description=f"MEDIQA-AnS {setting1} {setting2} {setting3} BigBio schema", + schema="bigbio_t2t", + subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}", + ) + ) + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source", + version=BIGBIO_VERSION, + description=f"MEDIQA-AnS {setting1} {setting2} {setting3} source schema", + schema="source", + subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}", + ), + ) + + DEFAULT_CONFIG_NAME = "mediqa_ans_page2answer_multi_abstractive_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source" and self.config.subset_id == "mediqa_ans_all": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "multi_abs_summ": datasets.Value("string"), + "multi_ext_summ": datasets.Value("string"), + "answers": [ + { + "id": datasets.Value("string"), + "answer_abs_summ": datasets.Value("string"), + "answer_ext_summ": datasets.Value("string"), + "section": datasets.Value("string"), + "article": datasets.Value("string"), + "url": datasets.Value("string"), + "rating": datasets.Value("string"), + } + ], + } + ) + elif self.config.schema == "source": + features = datasets.Features( + { + "question": datasets.Value("string"), + "question_id": datasets.Value("string"), + "summary": datasets.Value("string"), + "articles": [ + { + "answer_id": datasets.Value("string"), + "text": datasets.Value("string"), + "rating": datasets.Value("string"), + } + ], + } + ) + elif self.config.schema == "bigbio_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + file_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(file_path), + }, + ), + ] + + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + dataset = None + with open(filepath, "r", encoding="utf8") as infile: + dataset = json.load(infile) + + uid = it.count(0) + if self.config.name == "mediqa_ans_all_source": + dataset = self._json_dict_to_list(dataset, "id") + for example in dataset: + example["answers"] = self._json_dict_to_list(example["answers"], "id") + yield example["id"], example + else: + _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_", 3) + if self.config.schema == "source": + for example in self._generate_setting_examples(dataset, setting1, setting2, setting3): + yield next(uid), example + elif self.config.schema == "bigbio_t2t": + for example in self._generate_setting_examples(dataset, setting1, setting2, setting3): + example = self._source_to_t2t(example) + example["id"] = next(uid) + yield example["id"], example + + def _generate_setting_examples(self, dataset, setting1, setting2, setting3): + for question_id, question in dataset.items(): + example = {} + example["question_id"] = question_id + example["question"] = question["question"] + if setting2 == "single": + for answer_id, answer in question["answers"].items(): + example_ = example.copy() + if setting1 == "section2answer": + example_["articles"] = [ + {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]} + ] + elif setting1 == "page2answer": + example_["articles"] = [ + {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]} + ] + if setting3 == "abstractive": + example_["summary"] = answer["answer_abs_summ"] + elif setting3 == "extractive": + example_["summary"] = answer["answer_ext_summ"] + yield example_ + elif setting2 == "multi": + example["articles"] = [] + for answer_id, answer in question["answers"].items(): + if setting1 == "section2answer": + example["articles"].append( + {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]} + ) + elif setting1 == "page2answer": + example["articles"].append( + {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]} + ) + + if setting3 == "abstractive": + example["summary"] = question["multi_abs_summ"] + elif setting3 == "extractive": + example["summary"] = question["multi_ext_summ"] + yield example + + def _source_to_t2t(self, example): + example_ = {} + example_["document_id"] = "" + example_["text_1_name"] = "" + example_["text_2_name"] = "" + + text1 = "" + text1 += "Question ID: " + example["question_id"] + "\n" + text1 += "Question: " + example["question"] + "\n" + for article in example["articles"]: + text1 += "Answer ID: " + article["answer_id"] + "\n" + text1 += "Answer: " + article["text"] + "\n" + text1 += "Rating: " + article["rating"] + "\n" + example_["text_1"] = text1 + + example_["text_2"] = example["summary"] + + return example_ + + def _json_dict_to_list(self, json, new_key): + list_ = [] + for key, values in json.items(): + assert isinstance(values, dict), "Child element is not a dict" + assert new_key not in values, "New key already in values" + values[new_key] = key + list_.append(values) + return list_