Add Metadata Extraction for Podcast Queue

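This commit adds support for extracting title and author metadata from the page behind a URL when an article is added to the podcast queue. The title is read from the Open Graph og:title tag and the author from article:author, falling back to og:site_name. It includes database schema changes, migration logic, and UI updates to display the extracted metadata.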
author: Ben Sima <ben@bsima.me> 2025-09-04 09:34:30 -0400
committer: Ben Sima <ben@bsima.me> 2025-09-04 09:34:30 -0400
commit: ca22ffbb3eebef8d9f8b851b7b9b60a4f73a484f (patch)
tree: 21c4583ab1535886194998bd0c184bb628e0c56b
parent: 48fdb6610957d213739f2cc84bc4c9071be909ac (diff)
2 files changed, 162 insertions, 9 deletions
diff --git a/Biz/PodcastItLater/Core.py b/Biz/PodcastItLater/Core.py
index 6c04db8..86676b5 100644
--- a/Biz/PodcastItLater/Core.py
+++ b/Biz/PodcastItLater/Core.py
@@ -82,7 +82,9 @@ class Database:  # noqa: PLR0904
                     status TEXT DEFAULT 'pending',
                     retry_count INTEGER DEFAULT 0,
                     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                    error_message TEXT
+                    error_message TEXT,
+                    title TEXT,
+                    author TEXT
                 )
             """)
 
@@ -117,14 +119,19 @@ class Database:  # noqa: PLR0904
         # Run migration to add user support
         Database.migrate_to_multi_user(db_path)
 
+        # Run migration to add metadata fields
+        Database.migrate_add_metadata_fields(db_path)
+
     @staticmethod
-    def add_to_queue(
+    def add_to_queue(  # noqa: PLR0913, PLR0917
         url: str,
         email: str,
         user_id: int,
         db_path: str | None = None,
+        title: str | None = None,
+        author: str | None = None,
     ) -> int:
-        """Insert new job into queue, return job ID.
+        """Insert new job into queue with metadata, return job ID.
 
         Raises:
             ValueError: If job ID cannot be retrieved after insert.
@@ -134,8 +141,9 @@ class Database:  # noqa: PLR0904
         with Database.get_connection(db_path) as conn:
             cursor = conn.cursor()
             cursor.execute(
-                "INSERT INTO queue (url, email, user_id) VALUES (?, ?, ?)",
-                (url, email, user_id),
+                "INSERT INTO queue (url, email, user_id, title, author) "
+                "VALUES (?, ?, ?, ?, ?)",
+                (url, email, user_id, title, author),
             )
             conn.commit()
             job_id = cursor.lastrowid
@@ -284,7 +292,8 @@ class Database:  # noqa: PLR0904
         with Database.get_connection(db_path) as conn:
             cursor = conn.cursor()
             cursor.execute("""
-                SELECT id, url, email, status, created_at, error_message
+                SELECT id, url, email, status, created_at, error_message,
+                       title, author
                 FROM queue
                 WHERE status IN ('pending', 'processing', 'error')
                 ORDER BY created_at DESC
@@ -382,7 +391,7 @@ class Database:  # noqa: PLR0904
                 cursor.execute(
                     """
                     SELECT id, url, email, status, retry_count, created_at,
-                           error_message
+                           error_message, title, author
                     FROM queue
                     WHERE user_id = ?
                     ORDER BY created_at DESC
@@ -392,7 +401,7 @@ class Database:  # noqa: PLR0904
             else:
                 cursor.execute("""
                     SELECT id, url, email, status, retry_count, created_at,
-                           error_message
+                           error_message, title, author
                     FROM queue
                     ORDER BY created_at DESC
                 """)
@@ -490,6 +499,28 @@ class Database:  # noqa: PLR0904
             logger.info("Database migrated to support multiple users")
 
     @staticmethod
+    def migrate_add_metadata_fields(db_path: str | None = None) -> None:
+        """Add nullable title and author columns to queue when absent."""
+        resolved_path = db_path
+        if resolved_path is None:
+            resolved_path = Database.get_default_db_path()
+        additions = (
+            ("title", "ALTER TABLE queue ADD COLUMN title TEXT"),
+            ("author", "ALTER TABLE queue ADD COLUMN author TEXT"),
+        )
+        with Database.get_connection(resolved_path) as conn:
+            cursor = conn.cursor()
+            # Field 1 of each PRAGMA table_info row is the column name.
+            cursor.execute("PRAGMA table_info(queue)")
+            existing = {row[1] for row in cursor.fetchall()}
+            for column, statement in additions:
+                # Skip columns already present so reruns are harmless.
+                if column not in existing:
+                    cursor.execute(statement)
+            conn.commit()
+            logger.info("Database migrated to support metadata fields")
+
+    @staticmethod
     def create_user(email: str, db_path: str | None = None) -> tuple[int, str]:
         """Create a new user and return (user_id, token).
 
@@ -583,7 +614,8 @@ class Database:  # noqa: PLR0904
             cursor = conn.cursor()
             cursor.execute(
                 """
-                SELECT id, url, email, status, created_at, error_message
+                SELECT id, url, email, status, created_at, error_message,
+                       title, author
                 FROM queue
                 WHERE user_id = ? AND
                       status IN ('pending', 'processing', 'error')
diff --git a/Biz/PodcastItLater/Web.py b/Biz/PodcastItLater/Web.py
index 86b2099..036dd45 100644
--- a/Biz/PodcastItLater/Web.py
+++ b/Biz/PodcastItLater/Web.py
@@ -17,6 +17,8 @@ Provides ludic + htmx interface and RSS feed generation.
 # : dep starlette
 import Biz.EmailAgent
 import Biz.PodcastItLater.Core as Core
+import html as html_module
+import httpx
 import ludic.catalog.layouts as layouts
 import ludic.catalog.pages as pages
 import ludic.html as html
@@ -94,6 +96,55 @@ RSS_CONFIG = {
 }
 
 
+def extract_og_metadata(url: str) -> tuple[str | None, str | None]:
+    """Fetch a page and extract its Open Graph title and author.
+
+    The <meta> tags are read with the stdlib HTML parser rather than a
+    regex, so attribute order, extra attributes and quote characters
+    inside a value (e.g. content="Don't Panic") are handled, and HTML
+    entities in values are decoded by the parser. The author is taken
+    from article:author, falling back to og:site_name.
+
+    NOTE(review): url appears user-supplied; fetched server-side (SSRF).
+
+    Args:
+        url: Address of the article page to fetch.
+
+    Returns:
+        tuple: (title, author) - both may be None if extraction fails
+    """
+    from html.parser import HTMLParser
+
+    found: dict[str, str] = {}
+
+    class MetaTagCollector(HTMLParser):
+        """Record the first non-empty content of each named <meta> tag."""
+
+        def handle_starttag(
+            self, tag: str, attrs: list[tuple[str, str | None]]
+        ) -> None:
+            if tag != "meta":
+                return
+            attributes = dict(attrs)
+            key = attributes.get("property") or attributes.get("name")
+            content = attributes.get("content")
+            if key and content:
+                # Keys are matched case-insensitively; first tag wins.
+                found.setdefault(key.lower(), content)
+
+    try:
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+        MetaTagCollector().feed(response.text)
+    except Exception as e:  # noqa: BLE001
+        # Metadata is optional: any failure degrades to "no metadata".
+        logger.warning("Failed to extract metadata from %s: %s", url, e)
+        return None, None
+    else:
+        author = found.get("article:author") or found.get("og:site_name")
+        return found.get("og:title"), author
+
+
 def send_magic_link(email: str, token: str) -> None:
     """Send magic link email to user."""
     subject = "Login to PodcastItLater"
@@ -271,6 +322,21 @@ class QueueStatus(Component[AnyChildren, QueueStatusAttrs]):
                         style={"color": status_color, "font-weight": "bold"},
                     ),
                     html.br(),
+                    # Add title and author if available
+                    *(
+                        [
+                            html.div(
+                                html.strong(item["title"]),
+                                html.br() if item.get("author") else "",
+                                html.small(f"by {item['author']}")
+                                if item.get("author")
+                                else "",
+                                style={"margin": "5px 0"},
+                            ),
+                        ]
+                        if item.get("title")
+                        else []
+                    ),
                     html.small(
                         item["url"][:URL_TRUNCATE_LENGTH]
                         + (
@@ -445,6 +511,13 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]):
                                                     },
                                                 ),
                                                 html.th(
+                                                    "Title",
+                                                    style={
+                                                        "padding": "10px",
+                                                        "text-align": "left",
+                                                    },
+                                                ),
+                                                html.th(
                                                     "Email",
                                                     style={
                                                         "padding": "10px",
@@ -532,6 +605,49 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]):
                                                         },
                                                     ),
                                                     html.td(
+                                                        html.div(
+                                                            item.get(
+                                                                "title",
+                                                                "-",
+                                                            )[
+                                                                :TITLE_TRUNCATE_LENGTH
+                                                            ]
+                                                            + (
+                                                                "..."
+                                                                if item.get(
+                                                                    "title",
+                                                                )
+                                                                and len(
+                                                                    item[
+                                                                        "title"
+                                                                    ],
+                                                                )
+                                                                > (
+                                                                    TITLE_TRUNCATE_LENGTH
+                                                                )
+                                                                else ""
+                                                            ),
+                                                            title=item.get(
+                                                                "title",
+                                                                "",
+                                                            ),
+                                                            style={
+                                                                "max-width": (
+                                                                    "200px"
+                                                                ),
+                                                                "overflow": (
+                                                                    "hidden"
+                                                                ),
+                                                                "text-overflow": (  # noqa: E501
+                                                                    "ellipsis"
+                                                                ),
+                                                            },
+                                                        ),
+                                                        style={
+                                                            "padding": "10px",
+                                                        },
+                                                    ),
+                                                    html.td(
                                                         item["email"] or "-",
                                                         style={
                                                             "padding": "10px",
@@ -1183,11 +1299,16 @@ def submit_article(request: Request, data: FormData) -> html.div:
                 style={"color": "#dc3545"},
             )
 
+        # Extract Open Graph metadata
+        title, author = extract_og_metadata(url)
+
         job_id = Core.Database.add_to_queue(
             url,
             user["email"],
             user_id,
             get_database_path(),
+            title=title,
+            author=author,
         )
         return html.div(
             f"✓ Article submitted successfully! Job ID: {job_id}",
author	Ben Sima <ben@bsima.me>	2025-09-04 09:34:30 -0400
committer	Ben Sima <ben@bsima.me>	2025-09-04 09:34:30 -0400
commit	ca22ffbb3eebef8d9f8b851b7b9b60a4f73a484f (patch)
tree	21c4583ab1535886194998bd0c184bb628e0c56b
parent	48fdb6610957d213739f2cc84bc4c9071be909ac (diff)