diff options
Diffstat (limited to 'Biz/PodcastItLater/Web.py')
| -rw-r--r-- | Biz/PodcastItLater/Web.py | 121 |
1 files changed, 121 insertions, 0 deletions
# One complete <meta ...> tag. Quoted attribute values are consumed as a unit
# so a ">" inside a value does not end the tag early.
_META_TAG_RE = re.compile(
    r"""<meta\b((?:[^>"']|"[^"]*"|'[^']*')*)>""",
    re.IGNORECASE,
)

# One attribute inside a tag: name=value, where the value is double-quoted,
# single-quoted, or unquoted. The closing quote must be the same character as
# the opening one, so an apostrophe inside a double-quoted value is kept.
_META_ATTR_RE = re.compile(
    r"""([^\s"'<>/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))""",
)


def _collect_meta_tags(html_content: str) -> dict[str, str]:
    """Map each meta tag's property/name (lowercased) to its content.

    Attribute order and unrelated extra attributes are ignored. Content is
    HTML-unescaped and stripped; tags with no usable key or with empty
    content are skipped, and the first usable occurrence of a key wins.
    """
    found: dict[str, str] = {}
    for tag in _META_TAG_RE.finditer(html_content):
        attrs: dict[str, str] = {}
        for attr in _META_ATTR_RE.finditer(tag.group(1)):
            value = attr.group(2) or attr.group(3) or attr.group(4) or ""
            # Keep the first occurrence of a repeated attribute.
            attrs.setdefault(attr.group(1).lower(), value)

        key = (attrs.get("property") or attrs.get("name") or "").strip().lower()
        content = html_module.unescape(attrs.get("content", "")).strip()
        if key and content:
            found.setdefault(key, content)
    return found


def extract_og_metadata(url: str) -> tuple[str | None, str | None]:
    """Extract Open Graph title and author from URL.

    The page is fetched with a 10 second timeout, following redirects, and
    its ``<meta>`` tags are scanned with regular expressions (no HTML parser
    dependency). The title comes from ``og:title``; the author comes from
    ``article:author``, falling back to ``og:site_name``.

    This is best-effort: any failure while fetching or parsing is logged as a
    warning and reported as ``(None, None)`` instead of being raised.

    Args:
        url: Address of the page to inspect.

    Returns:
        tuple: (title, author) - each is a non-empty string with HTML
        entities decoded, or None if the tag is missing, has empty content,
        or extraction fails.
    """
    # NOTE(review): url appears to come from the article submission form, so
    # this is a server-side fetch of a user-supplied address - confirm that
    # internal/private hosts are rejected before this is called.
    try:
        response = httpx.get(url, timeout=10.0, follow_redirects=True)
        response.raise_for_status()
        meta = _collect_meta_tags(response.text)
    except Exception as e:  # noqa: BLE001
        # Metadata is optional; never let a bad page block the caller.
        logger.warning("Failed to extract metadata from %s: %s", url, e)
        return None, None

    title = meta.get("og:title")
    author = meta.get("article:author") or meta.get("og:site_name")
    return title, author
= "Login to PodcastItLater" @@ -271,6 +322,21 @@ class QueueStatus(Component[AnyChildren, QueueStatusAttrs]): style={"color": status_color, "font-weight": "bold"}, ), html.br(), + # Add title and author if available + *( + [ + html.div( + html.strong(item["title"]), + html.br() if item.get("author") else "", + html.small(f"by {item['author']}") + if item.get("author") + else "", + style={"margin": "5px 0"}, + ), + ] + if item.get("title") + else [] + ), html.small( item["url"][:URL_TRUNCATE_LENGTH] + ( @@ -445,6 +511,13 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]): }, ), html.th( + "Title", + style={ + "padding": "10px", + "text-align": "left", + }, + ), + html.th( "Email", style={ "padding": "10px", @@ -532,6 +605,49 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]): }, ), html.td( + html.div( + item.get( + "title", + "-", + )[ + :TITLE_TRUNCATE_LENGTH + ] + + ( + "..." + if item.get( + "title", + ) + and len( + item[ + "title" + ], + ) + > ( + TITLE_TRUNCATE_LENGTH + ) + else "" + ), + title=item.get( + "title", + "", + ), + style={ + "max-width": ( + "200px" + ), + "overflow": ( + "hidden" + ), + "text-overflow": ( # noqa: E501 + "ellipsis" + ), + }, + ), + style={ + "padding": "10px", + }, + ), + html.td( item["email"] or "-", style={ "padding": "10px", @@ -1183,11 +1299,16 @@ def submit_article(request: Request, data: FormData) -> html.div: style={"color": "#dc3545"}, ) + # Extract Open Graph metadata + title, author = extract_og_metadata(url) + job_id = Core.Database.add_to_queue( url, user["email"], user_id, get_database_path(), + title=title, + author=author, ) return html.div( f"✓ Article submitted successfully! Job ID: {job_id}", |
