summaryrefslogtreecommitdiff
path: root/Biz/PodcastItLater/Web.py
diff options
context:
space:
mode:
Diffstat (limited to 'Biz/PodcastItLater/Web.py')
-rw-r--r--Biz/PodcastItLater/Web.py121
1 files changed, 121 insertions, 0 deletions
diff --git a/Biz/PodcastItLater/Web.py b/Biz/PodcastItLater/Web.py
index 86b2099..036dd45 100644
--- a/Biz/PodcastItLater/Web.py
+++ b/Biz/PodcastItLater/Web.py
@@ -17,6 +17,8 @@ Provides ludic + htmx interface and RSS feed generation.
# : dep starlette
import Biz.EmailAgent
import Biz.PodcastItLater.Core as Core
+import html as html_module
+import httpx
import ludic.catalog.layouts as layouts
import ludic.catalog.pages as pages
import ludic.html as html
@@ -94,6 +96,55 @@ RSS_CONFIG = {
}
+def _find_meta_content(html_content: str, key: str) -> str | None:
+    """Return the decoded content of the first matching <meta> tag.
+
+    Args:
+        html_content: Raw HTML text to search.
+        key: Value of the tag's property/name attribute, e.g. "og:title".
+
+    Returns:
+        The content attribute with HTML entities decoded, or None when no
+        tag matches or the content is empty.
+    """
+    # The closing quote is a backreference to the opening one, so an
+    # apostrophe inside a double-quoted value (content="It's here") does
+    # not cut the captured text short.
+    match = re.search(
+        r'<meta\s+(?:property|name)=["\']'
+        + re.escape(key)
+        + r'["\']\s+content=(["\'])(.*?)\1',
+        html_content,
+        re.IGNORECASE,
+    )
+    if match is None:
+        return None
+    # Decode entities such as &amp; and treat an empty value as missing.
+    return html_module.unescape(match.group(2)) or None
+
+
+def extract_og_metadata(url: str) -> tuple[str | None, str | None]:
+    """Extract Open Graph title and author from URL.
+
+    Fetches the page with httpx (10 second timeout, redirects followed)
+    and scans the HTML for og:title, then for article:author with
+    og:site_name as the fallback author.
+
+    Only tags written with the property/name attribute immediately
+    followed by the content attribute are recognised; this is a simple
+    regex scan, not a full HTML parse.
+
+    Args:
+        url: Address of the page to inspect.
+
+    Returns:
+        tuple: (title, author) - either may be None when its tag is
+        missing or empty; both are None if the fetch fails.
+    """
+    try:
+        # Use httpx to fetch the page with a timeout
+        response = httpx.get(url, timeout=10.0, follow_redirects=True)
+        response.raise_for_status()
+        html_content = response.text
+    except Exception as e:  # noqa: BLE001
+        # Best-effort: a failed fetch is logged and reported as no metadata.
+        logger.warning("Failed to extract metadata from %s: %s", url, e)
+        return None, None
+
+    title = _find_meta_content(html_content, "og:title")
+
+    # Prefer article:author; fall back to og:site_name when it is absent
+    # or empty.
+    author = _find_meta_content(html_content, "article:author")
+    if author is None:
+        author = _find_meta_content(html_content, "og:site_name")
+
+    return title, author
+
+
def send_magic_link(email: str, token: str) -> None:
"""Send magic link email to user."""
subject = "Login to PodcastItLater"
@@ -271,6 +322,21 @@ class QueueStatus(Component[AnyChildren, QueueStatusAttrs]):
style={"color": status_color, "font-weight": "bold"},
),
html.br(),
+ # Add title and author if available
+ *(
+ [
+ html.div(
+ html.strong(item["title"]),
+ html.br() if item.get("author") else "",
+ html.small(f"by {item['author']}")
+ if item.get("author")
+ else "",
+ style={"margin": "5px 0"},
+ ),
+ ]
+ if item.get("title")
+ else []
+ ),
html.small(
item["url"][:URL_TRUNCATE_LENGTH]
+ (
@@ -445,6 +511,13 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]):
},
),
html.th(
+ "Title",
+ style={
+ "padding": "10px",
+ "text-align": "left",
+ },
+ ),
+ html.th(
"Email",
style={
"padding": "10px",
@@ -532,6 +605,49 @@ class AdminView(Component[AnyChildren, AdminViewAttrs]):
},
),
html.td(
+ html.div(
+ item.get(
+ "title",
+ "-",
+ )[
+ :TITLE_TRUNCATE_LENGTH
+ ]
+ + (
+ "..."
+ if item.get(
+ "title",
+ )
+ and len(
+ item[
+ "title"
+ ],
+ )
+ > (
+ TITLE_TRUNCATE_LENGTH
+ )
+ else ""
+ ),
+ title=item.get(
+ "title",
+ "",
+ ),
+ style={
+ "max-width": (
+ "200px"
+ ),
+ "overflow": (
+ "hidden"
+ ),
+ "text-overflow": ( # noqa: E501
+ "ellipsis"
+ ),
+ },
+ ),
+ style={
+ "padding": "10px",
+ },
+ ),
+ html.td(
item["email"] or "-",
style={
"padding": "10px",
@@ -1183,11 +1299,16 @@ def submit_article(request: Request, data: FormData) -> html.div:
style={"color": "#dc3545"},
)
+ # Extract Open Graph metadata
+ title, author = extract_og_metadata(url)
+
job_id = Core.Database.add_to_queue(
url,
user["email"],
user_id,
get_database_path(),
+ title=title,
+ author=author,
)
return html.div(
f"✓ Article submitted successfully! Job ID: {job_id}",