Skip to content

Commit d14cd7d

Browse files
committed
Add an RSS feed for the news page
This parses the HTML output from mkdocs and creates a feed file.
1 parent c572b49 commit d14cd7d

File tree

7 files changed

+192
-4
lines changed

7 files changed

+192
-4
lines changed

.github/workflows/main.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,9 @@ jobs:
4040
run: |
4141
poetry install
4242
43-
- run: poetry run mkdocs build
43+
- run: |
44+
poetry run mkdocs build
45+
poetry run python news2rss.py -f site/news/index.html -o site/news.xml
4446
4547
- name: Upload for pages
4648
uses: actions/upload-pages-artifact@v3

mkdocs.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ markdown_extensions:
9090
- pymdownx.critic
9191
- pymdownx.details
9292
- pymdownx.emoji:
93-
emoji_generator: !!python/name:pymdownx.emoji.to_svg
93+
emoji_index: !!python/name:material.extensions.emoji.twemoji
94+
emoji_generator: !!python/name:material.extensions.emoji.to_svg
9495
- pymdownx.inlinehilite
9596
- pymdownx.magiclink
9697
- pymdownx.mark

news2rss.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
#!/usr/bin/env python3
2+
# Converts the news page to an RSS feed
3+
4+
import re
5+
import argparse
6+
from bs4 import BeautifulSoup
7+
from datetime import datetime
8+
import xml.etree.ElementTree as ET
9+
from urllib.parse import urljoin
10+
11+
12+
def extract_canonical_url(soup):
    """Return the page's canonical URL, or None if it is not declared.

    Looks for a ``<link rel="canonical">`` tag in *soup* and returns its
    ``href`` attribute when present and non-empty.
    """

    tag = soup.find("link", rel="canonical")
    if not tag:
        return None
    href = tag.get("href")
    return href if href else None
19+
20+
21+
def make_urls_absolute(soup, base_url):
    """Rewrite relative ``href``/``src`` attributes to absolute URLs.

    Mutates anchor and image tags in *soup* in place, resolving each
    URL against *base_url*, and returns *soup* for convenience.
    """

    # Same treatment for both tag kinds: anchors first, then images,
    # matching the order callers observe.
    for tag_name, attr in (("a", "href"), ("img", "src")):
        for node in soup.find_all(tag_name, **{attr: True}):
            node[attr] = urljoin(base_url, node[attr])

    return soup
33+
34+
35+
def html_to_rss(html_content, feed_title, feed_description):
    """Convert the rendered news page HTML into an RSS 2.0 document string.

    Each ``<h3 id="YYYY-MM-DD...">`` heading on the page becomes one feed
    item; the item body is the HTML between that heading and the next one.

    Raises ValueError when the page declares no canonical <link>, since
    item links cannot be built without a base URL.
    """
    from datetime import timezone  # stdlib; local import to keep the file's import block untouched

    soup = BeautifulSoup(html_content, "html.parser")

    # Extract page URL from canonical link
    page_url = extract_canonical_url(soup)
    if not page_url:
        raise ValueError("No canonical URL found in the HTML content")

    print(f"Found canonical URL: {page_url}")

    # Convert relative URLs to absolute so feed readers resolve them correctly
    soup = make_urls_absolute(soup, page_url)

    # Create RSS root structure
    rss = ET.Element("rss", version="2.0")
    channel = ET.SubElement(rss, "channel")

    # Channel metadata
    ET.SubElement(channel, "title").text = feed_title
    ET.SubElement(channel, "link").text = page_url
    ET.SubElement(channel, "description").text = feed_description
    ET.SubElement(channel, "language").text = "en-us"
    # Bug fix: use an aware UTC timestamp. A naive datetime.now() returns
    # local time, which the hard-coded "+0000" offset would mislabel as UTC.
    ET.SubElement(channel, "lastBuildDate").text = datetime.now(timezone.utc).strftime(
        "%a, %d %b %Y %H:%M:%S +0000"
    )

    # Find all h3 entries (news items); only those with an id can be linked
    h3_tags = soup.find_all("h3", id=True)

    for h3 in h3_tags:
        # Entry ids are expected to start with the publication date,
        # e.g. "2025-06-20-replacing-...".
        h3_id = h3.get("id", "")
        date_match = re.match(r"(\d{4}-\d{2}-\d{2})", h3_id)

        if not date_match:
            print(f"Skipping h3 with id '{h3_id}' - no valid date found")
            continue

        date_str = date_match.group(1)

        # Get title text (remove any code tags for cleaner title)
        title_text = h3.get_text().strip()
        # Remove the "YYYY-MM-DD - " prefix from the title
        title_clean = re.sub(r"^\d{4}-\d{2}-\d{2}\s*-\s*", "", title_text)

        # Collect sibling nodes up to the next h3 as the item body
        content_parts = []
        current = h3.next_sibling
        while current and current.name != "h3":
            content_parts.append(str(current))
            current = current.next_sibling
        content = "\n".join(content_parts)

        # Create RSS item
        item = ET.SubElement(channel, "item")
        ET.SubElement(item, "title").text = title_clean

        # Use the actual page URL for link and guid, not just the base URL
        item_url = f"{page_url}#{h3_id}"
        ET.SubElement(item, "link").text = item_url
        ET.SubElement(item, "guid").text = item_url

        # Convert date to RFC 822 format; date-only entries get midnight UTC
        date_obj = datetime.strptime(date_str, "%Y-%m-%d")
        rfc_date = date_obj.strftime("%a, %d %b %Y 00:00:00 +0000")
        ET.SubElement(item, "pubDate").text = rfc_date

        # Item body: ElementTree entity-escapes the markup on serialization,
        # which is valid RSS (no CDATA section is needed or emitted).
        description = ET.SubElement(item, "description")
        description.text = content

    return ET.tostring(rss, encoding="unicode", xml_declaration=True)
107+
108+
109+
def main():
    """Command-line entry point: read an HTML file, write an RSS feed file."""
    argp = argparse.ArgumentParser(description="Convert HTML page to RSS feed")
    argp.add_argument(
        "-f", "--file", help="Path to the HTML file to convert", required=True
    )
    argp.add_argument("-o", "--output", help="Output RSS file path", required=True)
    argp.add_argument("--title", help="RSS feed title", default="MSYS2 News")
    argp.add_argument(
        "--description",
        help="RSS feed description",
        default="MSYS2 project news and updates",
    )
    options = argp.parse_args()

    print(f"Reading HTML from file {options.file}...")
    with open(options.file, "r", encoding="utf-8") as handle:
        page_html = handle.read()

    print("Converting to RSS feed...")
    feed_xml = html_to_rss(page_html, options.title, options.description)
    with open(options.output, "w", encoding="utf-8") as handle:
        handle.write(feed_xml)

    print(f"RSS feed saved to {options.output}")
133+
134+
135+
# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()

overrides/main.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,5 @@
3636
"@context": "https://schema.org"
3737
}
3838
</script>
39+
<link rel="alternate" type="application/rss+xml" title="RSS Feed (News)" href="{{ base_url }}/news.xml">
3940
{% endblock %}

poetry.lock

Lines changed: 48 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ package-mode = false
99
python = "^3.10"
1010
mkdocs = "^1.1.2"
1111
mkdocs-material = "^9.5"
12+
beautifulsoup4 = "^4.13.4"
1213

1314
[build-system]
1415
requires = ["poetry>=0.12"]

web/news.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ summary: Important events happening.
33
---
44
# News
55

6-
This page lists important changes or issues affecting MSYS2 users. We also post them to [Mastodon](https://fosstodon.org/@msys2org) / [Bluesky](https://bsky.app/profile/msys2org.bsky.social), including some not-so-important things :)
6+
This page lists important changes or issues affecting MSYS2 users. You can [:material-rss: subscribe via RSS](../news.xml). We also post them to [Mastodon](https://fosstodon.org/@msys2org) / [Bluesky](https://bsky.app/profile/msys2org.bsky.social), including some not-so-important things :)
77

88
### 2025-06-20 - Replacing `x86_64-pc-msys` with `x86_64-pc-cygwin`
99

0 commit comments

Comments
 (0)