Skip to content

Commit 7724c1e

Browse files
authored
Utilities to parse RSS feeds and make markdown stubs (#301)
* Utilities to parse RSS feeds and make markdown stubs * move slugify util * Add new dependencies * add changelog notes * add click * reset index * Add some unit testing * Add "pytest-localserver" to hatch dev dependencies
1 parent bcb76bd commit 7724c1e

File tree

8 files changed

+237
-1
lines changed

8 files changed

+237
-1
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ See [GitHub releases](https://github.com/pyOpenSci/pyosMeta/releases) page for a
66

77
## [Unreleased]
88

9+
* RSS feed parser that will generate Markdown stub files (@banesullivan, #301)
10+
* Three new dependencies:
11+
* `feedparser`: a utility library for fetching and parsing RSS feeds. This saves us from having to write quite a lot of fetching/parsing logic.
12+
* `unidecode`: comes with a new utility function to easily slugify long title strings.
13+
* `click`: for adding arguments to command line scripts
14+
915
## [v1.7.3] - 2025-08-07
1016

1117
* Fix: gracefully fail when collecting repository metrics outside of GitHub (@banesullivan, #300)

pyproject.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,15 @@ classifiers = [
2424
"Programming Language :: Python :: 3.11",
2525
]
2626
dependencies = [
27+
"click",
28+
"feedparser",
2729
"pydantic>=2.0",
2830
"python-doi",
2931
"python-dotenv",
3032
"requests",
3133
"ruamel-yaml>=0.17.21",
3234
"tqdm",
35+
"unidecode"
3336
]
3437
# This is metadata that pip reads to understand what Python versions your package supports
3538
requires-python = ">=3.10"
@@ -43,6 +46,7 @@ dev = [
4346
"pre-commit",
4447
"pytest",
4548
"pytest-cov",
49+
"pytest-localserver",
4650
"pytest-mock",
4751
]
4852

@@ -58,6 +62,7 @@ parse-history = "pyosmeta.cli.parse_history:main"
5862
update-contributors = "pyosmeta.cli.update_contributors:main"
5963
update-reviews = "pyosmeta.cli.process_reviews:main"
6064
update-review-teams = "pyosmeta.cli.update_review_teams:main"
65+
fetch-rss-feed = "pyosmeta.cli.fetch_rss_feed:main"
6166

6267
[tool.coverage.run]
6368
branch = true
@@ -72,7 +77,7 @@ version.source = "vcs"
7277
build.hooks.vcs.version-file = "src/pyosmeta/_version.py"
7378

7479
[tool.hatch.envs.test]
75-
dependencies = ["pytest", "pytest-cov", "coverage[toml]", "pytest-mock"]
80+
dependencies = ["pytest", "pytest-cov", "coverage[toml]", "pytest-localserver", "pytest-mock"]
7681

7782
[tool.hatch.envs.test.scripts]
7883
run-coverage = "pytest --cov-config=pyproject.toml --cov=pyosmeta --cov=tests/*"

src/pyosmeta/cli/fetch_rss_feed.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import click
2+
3+
from pyosmeta.parse_rss import create_rss_feed_stubs
4+
5+
6+
@click.command()
@click.argument("url")
# Reject an OUTPUT_DIR that already exists as a regular file up front, so the
# user gets a clear usage error instead of a failure while creating the
# directory later on.
@click.argument("output_dir", type=click.Path(file_okay=False))
def main(url: str, output_dir: str) -> None:
    """Create markdown stubs from an RSS feed URL into a directory.

    URL is the address of the RSS feed to fetch. OUTPUT_DIR is the directory
    that the generated Markdown stub files are written to.
    """
    create_rss_feed_stubs(url, output_dir)


if __name__ == "__main__":
    main()

src/pyosmeta/parse_rss.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
from pathlib import Path
2+
3+
import feedparser
4+
5+
from .utils_clean import slugify
6+
7+
8+
def parse_rss_feed(url: str) -> list[dict]:
    """Fetch and parse an RSS feed from a URL.

    Parameters
    ----------
    url : str
        Location of the feed, handed straight to ``feedparser.parse``.

    Returns
    -------
    list[dict]
        One plain dictionary per feed entry, holding every key the entry
        reports together with the value ``entry.get`` returns for it.
    """
    records = []
    for entry in feedparser.parse(url).entries:
        record = {}
        # Copy through ``entry.get`` so values are resolved the same way a
        # direct lookup on the feedparser entry would resolve them.
        for key in entry.keys():
            record[key] = entry.get(key)
        records.append(record)
    return records
15+
16+
17+
def _escape_yaml_double_quoted(text: str) -> str:
    """Escape ``text`` for use inside a YAML double-quoted scalar."""
    # Backslashes must be doubled first; otherwise the backslashes added
    # while escaping the quotes would themselves get doubled.
    return text.replace("\\", "\\\\").replace('"', '\\"')


def make_md_stub(index: int, title: str, summary: str, link: str) -> str:
    """Create a Markdown stub for an entry.

    The stub consists solely of a front matter block delimited by ``---``
    lines. The opening ``---`` is the very first line of the returned text
    (no blank line before it) so front matter parsers recognise it.

    Parameters
    ----------
    index : int
        Number placed in front of the title (``"<index>. <title>"``).
    title : str
        Entry title; double quotes and backslashes are escaped.
    summary : str
        Entry summary used as the excerpt; double quotes and backslashes are
        escaped.
    link : str
        Target of the entry, written unquoted after ``link:``.

    Returns
    -------
    str
        The front matter block, terminated by a newline.
    """
    # Both values are interpolated between double quotes, so an unescaped
    # quote or backslash in the feed text would produce invalid YAML.
    safe_title = _escape_yaml_double_quoted(title)
    safe_summary = _escape_yaml_double_quoted(summary)
    return f'''---
title: "{index}. {safe_title}"
excerpt: "
{safe_summary}"
link: {link}
btn_label: View Tutorial
btn_class: btn--success btn--large
---
'''
29+
30+
31+
def fetch_rss_feed_as_stubs(url: str) -> dict[str, str]:
    """Fetch an RSS feed and return a dictionary of Markdown stubs.

    The keys of the dictionary are filenames, and the values are the Markdown
    content. Filenames have the form ``"<NN>-<slugified title>.md"`` where
    ``NN`` is the zero-padded position of the entry in the feed.

    Entries without a title are skipped and reported with a warning. Their
    position is still counted, so the numbering of the remaining entries
    matches their position in the feed.
    """
    import warnings

    stubs: dict[str, str] = {}
    for i, item in enumerate(parse_rss_feed(url)):
        title = item.get("title")
        if not title:
            warnings.warn(
                f"Skipping RSS entry {i} from {url}: entry has no title.",
                stacklevel=2,
            )
            continue
        filename = f"{i:02d}-{slugify(title)}.md"
        stubs[filename] = make_md_stub(
            index=i,
            title=title,
            # ``dict.get(key, default)`` only applies the default when the key
            # is absent; ``or`` also covers keys that are present but hold
            # ``None`` or an empty string.
            summary=item.get("summary") or "",
            link=item.get("link") or "#",
        )
    return stubs
53+
54+
55+
def create_rss_feed_stubs(url: str, output_dir: str) -> None:
    """Create markdown stubs from an RSS feed URL into a directory.

    Parameters
    ----------
    url : str
        Location of the RSS feed to fetch.
    output_dir : str
        Directory the stub files are written to. It is created (including
        missing parents) when the first stub is written. Existing files with
        the same name are overwritten; other files are left untouched.
    """
    target_dir = Path(output_dir)
    for filename, content in fetch_rss_feed_as_stubs(url).items():
        # TODO: should we wipe existing files?
        path = target_dir / filename
        path.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: feed text can contain non-ASCII characters
        # and the platform default encoding is not UTF-8 everywhere.
        path.write_text(content, encoding="utf-8")

src/pyosmeta/utils_clean.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import doi
1111
import requests
12+
import unidecode
1213

1314
from .logging import logger
1415

@@ -213,3 +214,9 @@ def clean_archive(archive):
213214
return None
214215
else:
215216
raise ValueError(f"Invalid archive URL: {archive}")
217+
218+
219+
def slugify(text: str) -> str:
    """Convert a long title/text into a slug suitable for filenames/URLs.

    The text is transliterated to ASCII with ``unidecode``, lower-cased, and
    every run of non-word characters or underscores is collapsed into a
    single hyphen. Hyphens at the start or end of the result are removed, so
    a title ending in punctuation does not produce a slug like ``"title-"``.
    """
    text = unidecode.unidecode(text).lower()
    return re.sub(r"[\W_]+", "-", text).strip("-")

tests/conftest.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,3 +175,11 @@ def _data_file(
175175
return path
176176

177177
return _data_file
178+
179+
180+
@pytest.fixture
def rss_feed_url(httpserver):
    """Serve a local RSS feed for testing.

    The content of ``tutorials.rss`` in the test data directory is served by
    the ``httpserver`` fixture, and the URL of that server is returned.
    """
    path = DATA_DIR / "tutorials.rss"
    # Read as UTF-8 explicitly: the feed declares that encoding and contains
    # non-ASCII characters, so the platform default must not be relied on.
    httpserver.serve_content(path.read_text(encoding="utf-8"))
    return httpserver.url

tests/data/tutorials.rss

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
<?xml version='1.0' encoding='UTF-8'?>
2+
<!-- https://www.pyopensci.org/python-package-guide/tutorials.rss -->
3+
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
4+
<channel>
5+
<title>pyOpenSci Tutorials</title>
6+
<link>https://www.pyopensci.org/python-package-guide/tutorials/intro.html</link>
7+
<atom:link href="https://www.pyopensci.org/python-package-guide/tutorials.rss" rel="self"/>
8+
<description>A tutorial feed that lists metadata for the pyOpenSci Python packaging tutorials so we can automatically list them on our website.</description>
9+
<language>en</language>
10+
<lastBuildDate>Mon, 11 Aug 2025 21:09:23 GMT</lastBuildDate>
11+
<item>
12+
<title>Add a License and Code of Conduct to your python package</title>
13+
<link>https://www.pyopensci.org/python-package-guide/tutorials/add-license-coc.html</link>
14+
<description>Learn how to add a LICENSE and CODE_OF_CONDUCT file to your Python package. This lesson covers choosing a permissive license, placing key files for visibility on GitHub and PyPI, and adopting the Contributor Covenant to support an inclusive community.</description>
15+
<author>pyOpenSci</author>
16+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/add-license-coc.html</guid>
17+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
18+
</item>
19+
<item>
20+
<title>Add a README file to your Python package</title>
21+
<link>https://www.pyopensci.org/python-package-guide/tutorials/add-readme.html</link>
22+
<description>Learn how to create a clear, effective README file for your Python package. This lesson covers what to include, why each section matters, and how a well-structured README improves usability and discoverability on GitHub and PyPI.</description>
23+
<author>pyOpenSci</author>
24+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/add-readme.html</guid>
25+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
26+
</item>
27+
<item>
28+
<title>Command Line Reference Guide</title>
29+
<link>https://www.pyopensci.org/python-package-guide/tutorials/command-line-reference.html</link>
30+
<description>Learn how to add a command-line interface (CLI) to your Python package using the argparse library. This lesson walks you through creating a CLI entry point so users can run your package directly from the terminal.</description>
31+
<author>pyOpenSci</author>
32+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/command-line-reference.html</guid>
33+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
34+
</item>
35+
<item>
36+
<title>Create a Python package from scratch, a beginner-friendly tutorial</title>
37+
<link>https://www.pyopensci.org/python-package-guide/tutorials/create-python-package.html</link>
38+
<description>Learn how to create a Python package and make your code installable using Hatch. This tutorial walks you through structuring your code and configuring a pyproject.toml so others can easily install and use your package.</description>
39+
<author>pyOpenSci</author>
40+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/create-python-package.html</guid>
41+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
42+
</item>
43+
<item>
44+
<title>Use Hatch environments with your Python package: a beginner-friendly tutorial</title>
45+
<link>https://www.pyopensci.org/python-package-guide/tutorials/develop-python-package-hatch.html</link>
46+
<description>The pyOpenSci pure Python package template uses Hatch to manage environments and run tests, docs, and other maintenance steps. Learn how to use Hatch environments to manage your Python package.</description>
47+
<author>pyOpenSci</author>
48+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/develop-python-package-hatch.html</guid>
49+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
50+
</item>
51+
<item>
52+
<title>Get to Know Hatch</title>
53+
<link>https://www.pyopensci.org/python-package-guide/tutorials/get-to-know-hatch.html</link>
54+
<description>Get started with Hatch, a modern Python packaging tool. This lesson introduces Hatch’s features and shows how it simplifies environment management, project scaffolding, and building your package.</description>
55+
<author>pyOpenSci</author>
56+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/get-to-know-hatch.html</guid>
57+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
58+
</item>
59+
<item>
60+
<title>Python packaging 101</title>
61+
<link>https://www.pyopensci.org/python-package-guide/tutorials/intro.html</link>
62+
<description>This page outlines the key steps to create, document, and share a high-quality scientific Python package. Here you will also get an overview of the pyOpenSci packaging guide and what you’ll learn.</description>
63+
<author>pyOpenSci</author>
64+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/intro.html</guid>
65+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
66+
</item>
67+
<item>
68+
<title>Publish your Python package that is on PyPI to conda-forge</title>
69+
<link>https://www.pyopensci.org/python-package-guide/tutorials/publish-conda-forge.html</link>
70+
<description>Learn how to publish your Python package on conda-forge to make it easily installable with conda. This lesson covers the submission process, metadata requirements, and maintaining your feedstock.</description>
71+
<author>pyOpenSci</author>
72+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/publish-conda-forge.html</guid>
73+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
74+
</item>
75+
<item>
76+
<title>Publish your Python package to PyPI</title>
77+
<link>https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html</link>
78+
<description>Learn how to publish your Python package on PyPI so others can install it using pip. This lesson covers building your package, creating a PyPI account, and uploading your distribution files.</description>
79+
<author>pyOpenSci</author>
80+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/publish-pypi.html</guid>
81+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
82+
</item>
83+
<item>
84+
<title>Make your Python package PyPI ready - pyproject.toml</title>
85+
<link>https://www.pyopensci.org/python-package-guide/tutorials/pyproject-toml.html</link>
86+
<description>The pyproject.toml file is the central configuration file for building and packaging Python projects. This lesson explains key sections like name, version, dependencies, and how they support packaging and distribution. You’ll learn how to set up this file to ensure your package is ready for publishing.</description>
87+
<author>pyOpenSci</author>
88+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/pyproject-toml.html</guid>
89+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
90+
</item>
91+
<item>
92+
<title>Using Hatch to Migrate setup.py to a pyproject.toml</title>
93+
<link>https://www.pyopensci.org/python-package-guide/tutorials/setup-py-to-pyproject-toml.html</link>
94+
<description>If you’re creating a pure Python project, pyproject.toml is preferred over setup.py for packaging and configuration. Learn how to migrate from the older setup.py format to the modern pyproject.toml file. This lesson walks you through updating your package metadata and build settings to align with current Python packaging standards.</description>
95+
<author>pyOpenSci</author>
96+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/setup-py-to-pyproject-toml.html</guid>
97+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
98+
</item>
99+
<item>
100+
<title>Setup Trusted Publishing for secure and automated publishing via GitHub Actions</title>
101+
<link>https://www.pyopensci.org/python-package-guide/tutorials/trusted-publishing.html</link>
102+
<description>Learn how to publish your Python package automatically via GitHub Actions. This lesson also covers how to do publishing in a secure way by using Trusted Publishing.</description>
103+
<author>pyOpenSci</author>
104+
<guid isPermaLink="true">https://www.pyopensci.org/python-package-guide/tutorials/trusted-publishing.html</guid>
105+
<pubDate>Mon, 11 Aug 2025 21:07:32 GMT</pubDate>
106+
</item>
107+
</channel>
108+
</rss>

tests/unit/test_parse_rss.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
from pyosmeta.parse_rss import fetch_rss_feed_as_stubs, parse_rss_feed
2+
3+
4+
def test_rss_feed_parse(rss_feed_url):
    """Every parsed entry is a dict exposing title, link and summary."""
    entries = parse_rss_feed(rss_feed_url)
    assert entries is not None
    assert len(entries) > 0
    for item in entries:
        assert isinstance(item, dict)
        for field in ("title", "link", "summary"):
            assert field in item
13+
14+
15+
def test_fetch_rss_feed_as_stubs(rss_feed_url):
    """Stubs map ``.md`` filenames to text containing the expected keys."""
    result = fetch_rss_feed_as_stubs(rss_feed_url)
    assert isinstance(result, dict)
    assert len(result) > 0
    for name, body in result.items():
        assert name.endswith(".md")
        assert isinstance(body, str)
        for marker in ("title:", "excerpt:", "link:"):
            assert marker in body

0 commit comments

Comments
 (0)