Skip to content

Commit 7494c9b

Browse files
author
The TensorFlow Datasets Authors
committed
Block 2.0.0 version of the multi_news dataset as it contains broken URLs.
PiperOrigin-RevId: 797281061
1 parent 6cda07f commit 7494c9b

File tree

2 files changed

+13
-6
lines changed

2 files changed

+13
-6
lines changed

tensorflow_datasets/datasets/multi_news/multi_news_dataset_builder.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,16 @@ class Builder(tfds.core.GeneratorBasedBuilder):
3434
VERSION = tfds.core.Version("2.1.0")
3535
RELEASE_NOTES = {
3636
"1.0.0": "Initial release.",
37-
"2.0.0": "Update the dataset with valid URLs.",
38-
"2.1.0": "Update the dataset with cleaned URLs.",
37+
"2.0.0": "[Do not use] Update the dataset with valid URLs.",
38+
"2.1.0": (
39+
"Update the dataset with the correct URLs. The URLs in this version"
40+
" come from HuggingFace's dataset repo, which is curated by the same"
41+
" author: https://huggingface.co/datasets/alexfabbri/multi_news."
42+
),
3943
}
44+
BLOCKED_VERSIONS = tfds.core.utils.BlockedVersions(
45+
versions={"2.0.0": "The URLs of this version are invalid."}
46+
)
4047

4148
def _info(self) -> tfds.core.DatasetInfo:
4249
"""Returns the dataset metadata."""
@@ -77,9 +84,10 @@ def _generate_examples(self, src_file, tgt_file):
7784
).open() as tgt_f:
7885
for i, (src_line, tgt_line) in enumerate(zip(src_f, tgt_f)):
7986
yield i, {
80-
# In original file, each line has one example and natural newline
81-
# tokens "\n" are being replaced with "NEWLINE_CHAR". Here restore
82-
# the natural newline token to avoid special vocab "NEWLINE_CHAR".
87+
# In the original file, each line has one example and natural
88+
# newline tokens "\n" are being replaced with "NEWLINE_CHAR".
89+
# Here, we restore the natural newline token to avoid the special
90+
# vocab token "NEWLINE_CHAR".
8391
_DOCUMENT: src_line.strip().replace("NEWLINE_CHAR", "\n"),
8492
_SUMMARY: tgt_line.strip().lstrip(),
8593
}

tensorflow_datasets/url_checksums/multi_news.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)