From ca22ffbb3eebef8d9f8b851b7b9b60a4f73a484f Mon Sep 17 00:00:00 2001
From: Ben Sima <ben@bsima.me>
Date: Thu, 4 Sep 2025 09:34:30 -0400
Subject: Add Metadata Extraction for Podcast Queue

This commit adds support for extracting title and author metadata from URLs when
adding articles to the podcast queue. It includes database schema changes,
migration logic, and UI updates to display the extracted metadata.
---
 Biz/PodcastItLater/Web.py | 121 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

(limited to 'Biz/PodcastItLater/Web.py')

diff --git a/Biz/PodcastItLater/Web.py b/Biz/PodcastItLater/Web.py
index 86b2099..036dd45 100644
--- a/Biz/PodcastItLater/Web.py
+++ b/Biz/PodcastItLater/Web.py
@@ -17,6 +17,8 @@ Provides ludic + htmx interface and RSS feed generation.
 # : dep starlette
 import Biz.EmailAgent
 import Biz.PodcastItLater.Core as Core
+import html as html_module
+import httpx
 import ludic.catalog.layouts as layouts
 import ludic.catalog.pages as pages
 import ludic.html as html
@@ -94,6 +96,55 @@ RSS_CONFIG = {
 }
 
 
+def extract_og_metadata(url: str) -> tuple[str | None, str | None]:
+    """Extract Open Graph title and author from the page at ``url``.
+
+    Best-effort: any failure is logged as a warning and yields (None, None).
+    Only ``<meta property|name="KEY" content="...">`` (in that attribute
+    order) is recognised; the author is ``article:author``, falling back
+    to ``og:site_name`` when that is missing or empty.
+
+    Args:
+        url: Address of the article to fetch (10 second timeout, redirects
+            are followed).
+
+    Returns:
+        tuple: (title, author) - each is None when missing or empty.
+    """
+
+    def first_meta(page: str, *keys: str) -> str | None:
+        """Return the unescaped content of the first non-empty meta key."""
+        for key in keys:
+            # Regex keeps this dependency-free. The backreference makes the
+            # value end at the quote that opened it, so content="It's here"
+            # is not cut off at the apostrophe.
+            found = re.search(
+                rf'<meta\s+(?:property|name)=["\']{key}["\']\s+'
+                r'content=(["\'])(.*?)\1',
+                page,
+                re.IGNORECASE,
+            )
+            if found and found.group(2):
+                # Clean up HTML entities
+                return html_module.unescape(found.group(2))
+        return None
+
+    try:
+        # NOTE(review): url looks user-supplied and is fetched server-side
+        # with redirects enabled - confirm internal hosts are blocked (SSRF).
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+
+        html_content = response.text
+        title = first_meta(html_content, "og:title")
+        author = first_meta(html_content, "article:author", "og:site_name")
+    except Exception as e:  # noqa: BLE001
+        logger.warning("Failed to extract metadata from %s: %s", url, e)
+        return None, None
+    else:
+        return title, author
+
+
 def send_magic_link(email: str, token: str) -> None:
     """Send magic link email to user."""
     subject = "Login to PodcastItLater"
@@ -271,6 +322,21 @@ class QueueStatus(Component[AnyChildren, QueueStatusAttrs]):
                         style={"color": status_color, "font-weight": "bold"},
                     ),
                     html.br(),
+                    # Add title and author if available
+                    *(
+                        [
+                            html.div(
+                                html.strong(item["title"]),
+                                html.br() if item.get("author") else "",
+                                html.small(f"by {item['author']}")
+                                if item.get("author")
+                                else "",
+                                style={"margin": "5px 0"},
+                            ),
+                        ]
+                        if item.get("title")
+                        else []
+                    ),
                     html.small(
                         item["url"][:URL_TRUNCATE_LENGTH]
                         + (
@@ -444,6 +510,13 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]):
                                                         "text-align": "left",
                                                     },
                                                 ),
+                                                html.th(
+                                                    "Title",
+                                                    style={
+                                                        "padding": "10px",
+                                                        "text-align": "left",
+                                                    },
+                                                ),
                                                 html.th(
                                                     "Email",
                                                     style={
@@ -531,6 +604,49 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]):
                                                             "padding": "10px",
                                                         },
                                                     ),
+                                                    html.td(
+                                                        html.div(
+                                                            item.get(
+                                                                "title",
+                                                                "-",
+                                                            )[
+                                                                :TITLE_TRUNCATE_LENGTH
+                                                            ]
+                                                            + (
+                                                                "..."
+                                                                if item.get(
+                                                                    "title",
+                                                                )
+                                                                and len(
+                                                                    item[
+                                                                        "title"
+                                                                    ],
+                                                                )
+                                                                > (
+                                                                    TITLE_TRUNCATE_LENGTH
+                                                                )
+                                                                else ""
+                                                            ),
+                                                            title=item.get(
+                                                                "title",
+                                                                "",
+                                                            ),
+                                                            style={
+                                                                "max-width": (
+                                                                    "200px"
+                                                                ),
+                                                                "overflow": (
+                                                                    "hidden"
+                                                                ),
+                                                                "text-overflow": (  # noqa: E501
+                                                                    "ellipsis"
+                                                                ),
+                                                            },
+                                                        ),
+                                                        style={
+                                                            "padding": "10px",
+                                                        },
+                                                    ),
                                                     html.td(
                                                         item["email"] or "-",
                                                         style={
@@ -1183,11 +1299,16 @@ def submit_article(request: Request, data: FormData) -> html.div:
                 style={"color": "#dc3545"},
             )
 
+        # Extract Open Graph metadata
+        title, author = extract_og_metadata(url)
+
         job_id = Core.Database.add_to_queue(
             url,
             user["email"],
             user_id,
             get_database_path(),
+            title=title,
+            author=author,
         )
         return html.div(
             f"✓ Article submitted successfully! Job ID: {job_id}",
-- 
cgit v1.2.3