summary refs log tree commit diff
diff options
context:
space:
mode:
author Ben Sima <ben@bsima.me> 2025-11-16 03:47:16 -0500
committer Ben Sima <ben@bsima.me> 2025-11-16 03:47:16 -0500
commit f74ee8bc380f07e597b638a719e7bbfe9461a031 (patch)
tree 7562c8f38d87c7743d74b84012ba8bffc843b0e2
parent 081f0759b37452bb1319c4f5f88a1d451a5177a9 (diff)
Add audio intro/outro and comprehensive tests
- Enhanced Worker.py to extract publication date and author from articles
- Added intro TTS with metadata (title, author, publication date)
- Added outro TTS with attribution
- Combined intro, pauses, content, and outro in Worker.py
- Added comprehensive tests for public feed, deduplication, metrics, and intro/outro

All tests passing (Worker: 30 tests, Web: 43 tests)

Tasks completed:
- t-gcNemK: Extract metadata in Worker.py
- t-gcPraJ: Add intro TTS generation
- t-gcRCzw: Add outro TTS generation
- t-gcTPQn: Combine audio segments
- t-gcW6zN: Tests for public feed
- t-gdlWtu: Tests for deduplication
- t-gdoeYo: Tests for metrics tracking
- t-gdqsl7: Tests for audio intro/outro
-rw-r--r-- .tasks/tasks.jsonl 20
-rw-r--r-- Biz/PodcastItLater/Web.py 239
-rw-r--r-- Biz/PodcastItLater/Worker.py 354
3 files changed, 543 insertions, 70 deletions
diff --git a/.tasks/tasks.jsonl b/.tasks/tasks.jsonl
index f86f9ab..132d31a 100644
--- a/.tasks/tasks.jsonl
+++ b/.tasks/tasks.jsonl
@@ -99,14 +99,14 @@
{"taskCreatedAt":"2025-11-16T04:07:17.092115521Z","taskDependencies":[],"taskId":"t-gc6Vrk","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add POST /admin/episode/{id}/toggle-public endpoint","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:13:58.727479053Z"}
{"taskCreatedAt":"2025-11-16T04:07:17.6266109Z","taskDependencies":[],"taskId":"t-gc9aud","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add '+ Add to your feed' button on episode pages for logged-in users","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:22:35.253656788Z"}
{"taskCreatedAt":"2025-11-16T04:07:18.165342861Z","taskDependencies":[],"taskId":"t-gcbqDl","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add POST /episode/{id}/add-to-feed endpoint","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:22:35.305050805Z"}
-{"taskCreatedAt":"2025-11-16T04:07:18.700573408Z","taskDependencies":[],"taskId":"t-gcdFSb","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add POST /episode/{id}/track endpoint for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:18.700573408Z"}
-{"taskCreatedAt":"2025-11-16T04:07:19.229153372Z","taskDependencies":[],"taskId":"t-gcfTnG","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add JavaScript to episode player for tracking play events","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:19.229153372Z"}
-{"taskCreatedAt":"2025-11-16T04:07:27.174644219Z","taskDependencies":[],"taskId":"t-gcNemK","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Enhance Worker.py to extract publication date and author metadata from articles","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:27.174644219Z"}
-{"taskCreatedAt":"2025-11-16T04:07:27.700527081Z","taskDependencies":[],"taskId":"t-gcPraJ","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add intro TTS generation with metadata (title, author, date)","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:27.700527081Z"}
-{"taskCreatedAt":"2025-11-16T04:07:28.221004581Z","taskDependencies":[],"taskId":"t-gcRCzw","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add outro TTS generation with title and author","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:28.221004581Z"}
-{"taskCreatedAt":"2025-11-16T04:07:28.74867703Z","taskDependencies":[],"taskId":"t-gcTPQn","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Combine intro, pauses, article content, and outro in Worker.py","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:28.74867703Z"}
-{"taskCreatedAt":"2025-11-16T04:07:29.289653388Z","taskDependencies":[],"taskId":"t-gcW6zN","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for public feed functionality","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:29.289653388Z"}
-{"taskCreatedAt":"2025-11-16T04:07:35.447349966Z","taskDependencies":[],"taskId":"t-gdlWtu","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for episode deduplication","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:35.447349966Z"}
-{"taskCreatedAt":"2025-11-16T04:07:35.995113703Z","taskDependencies":[],"taskId":"t-gdoeYo","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:35.995113703Z"}
-{"taskCreatedAt":"2025-11-16T04:07:36.52315156Z","taskDependencies":[],"taskId":"t-gdqsl7","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for audio intro/outro generation","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:36.52315156Z"}
+{"taskCreatedAt":"2025-11-16T04:07:18.700573408Z","taskDependencies":[],"taskId":"t-gcdFSb","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add POST /episode/{id}/track endpoint for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:30:51.238117273Z"}
+{"taskCreatedAt":"2025-11-16T04:07:19.229153372Z","taskDependencies":[],"taskId":"t-gcfTnG","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add JavaScript to episode player for tracking play events","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:30:51.289470508Z"}
+{"taskCreatedAt":"2025-11-16T04:07:27.174644219Z","taskDependencies":[],"taskId":"t-gcNemK","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Enhance Worker.py to extract publication date and author metadata from articles","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.196162661Z"}
+{"taskCreatedAt":"2025-11-16T04:07:27.700527081Z","taskDependencies":[],"taskId":"t-gcPraJ","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add intro TTS generation with metadata (title, author, date)","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.247694148Z"}
+{"taskCreatedAt":"2025-11-16T04:07:28.221004581Z","taskDependencies":[],"taskId":"t-gcRCzw","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add outro TTS generation with title and author","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.298838151Z"}
+{"taskCreatedAt":"2025-11-16T04:07:28.74867703Z","taskDependencies":[],"taskId":"t-gcTPQn","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Combine intro, pauses, article content, and outro in Worker.py","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.360155369Z"}
+{"taskCreatedAt":"2025-11-16T04:07:29.289653388Z","taskDependencies":[],"taskId":"t-gcW6zN","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for public feed functionality","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.410867588Z"}
+{"taskCreatedAt":"2025-11-16T04:07:35.447349966Z","taskDependencies":[],"taskId":"t-gdlWtu","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for episode deduplication","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.461656748Z"}
+{"taskCreatedAt":"2025-11-16T04:07:35.995113703Z","taskDependencies":[],"taskId":"t-gdoeYo","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.513956262Z"}
+{"taskCreatedAt":"2025-11-16T04:07:36.52315156Z","taskDependencies":[],"taskId":"t-gdqsl7","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for audio intro/outro generation","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.574397661Z"}
{"taskCreatedAt":"2025-11-16T04:07:37.059671738Z","taskDependencies":[],"taskId":"t-gdsHUA","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Create admin metrics dashboard view","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:37.059671738Z"}
diff --git a/Biz/PodcastItLater/Web.py b/Biz/PodcastItLater/Web.py
index a706eb5..b41f31d 100644
--- a/Biz/PodcastItLater/Web.py
+++ b/Biz/PodcastItLater/Web.py
@@ -2363,6 +2363,242 @@ class TestEpisodeDetailPage(BaseWebTest):
)
+class TestPublicFeed(BaseWebTest):
+ """Test public feed functionality."""
+
+ def setUp(self) -> None:
+ """Set up test database, client, and create sample episodes."""
+ super().setUp()
+
+ # Create admin user
+ self.admin_id, _ = Core.Database.create_user(
+ "ben@bensima.com",
+ status="active",
+ )
+
+ # Create some episodes, some public, some private
+ self.public_episode_id = Core.Database.create_episode(
+ title="Public Episode",
+ audio_url="https://example.com/public.mp3",
+ duration=300,
+ content_length=1000,
+ user_id=self.admin_id,
+ author="Test Author",
+ original_url="https://example.com/public",
+ original_url_hash=Core.hash_url("https://example.com/public"),
+ )
+ Core.Database.mark_episode_public(self.public_episode_id)
+
+ self.private_episode_id = Core.Database.create_episode(
+ title="Private Episode",
+ audio_url="https://example.com/private.mp3",
+ duration=200,
+ content_length=800,
+ user_id=self.admin_id,
+ author="Test Author",
+ original_url="https://example.com/private",
+ original_url_hash=Core.hash_url("https://example.com/private"),
+ )
+
+ def test_public_feed_page(self) -> None:
+ """Public feed page should show only public episodes."""
+ response = self.client.get("/public")
+
+ self.assertEqual(response.status_code, 200)
+ self.assertIn("Public Episode", response.text)
+ self.assertNotIn("Private Episode", response.text)
+
+ def test_home_page_shows_public_feed_when_logged_out(self) -> None:
+ """Home page should show public episodes when user is not logged in."""
+ response = self.client.get("/")
+
+ self.assertEqual(response.status_code, 200)
+ self.assertIn("Public Episode", response.text)
+ self.assertNotIn("Private Episode", response.text)
+
+ def test_admin_can_toggle_episode_public(self) -> None:
+ """Admin should be able to toggle episode public/private status."""
+ # Login as admin
+ self.client.post("/login", data={"email": "ben@bensima.com"})
+
+ # Toggle private episode to public
+ response = self.client.post(
+ f"/admin/episode/{self.private_episode_id}/toggle-public",
+ )
+
+ self.assertEqual(response.status_code, 200)
+
+ # Verify it's now public
+ episode = Core.Database.get_episode_by_id(self.private_episode_id)
+ self.assertEqual(episode["is_public"], 1) # type: ignore[index]
+
+ def test_non_admin_cannot_toggle_public(self) -> None:
+ """Non-admin users should not be able to toggle public status."""
+ # Create and login as regular user
+ _user_id, _ = Core.Database.create_user("user@example.com")
+ self.client.post("/login", data={"email": "user@example.com"})
+
+ # Try to toggle
+ response = self.client.post(
+ f"/admin/episode/{self.private_episode_id}/toggle-public",
+ )
+
+ self.assertEqual(response.status_code, 403)
+
+
+class TestEpisodeDeduplication(BaseWebTest):
+ """Test episode deduplication functionality."""
+
+ def setUp(self) -> None:
+ """Set up test database, client, and create test user."""
+ super().setUp()
+
+ self.user_id, self.token = Core.Database.create_user(
+ "test@example.com",
+ status="active",
+ )
+
+ # Create an existing episode
+ self.existing_url = "https://example.com/article"
+ self.url_hash = Core.hash_url(self.existing_url)
+
+ self.episode_id = Core.Database.create_episode(
+ title="Existing Article",
+ audio_url="https://example.com/audio.mp3",
+ duration=300,
+ content_length=1000,
+ user_id=self.user_id,
+ author="Test Author",
+ original_url=self.existing_url,
+ original_url_hash=self.url_hash,
+ )
+
+ def test_url_normalization(self) -> None:
+ """URLs should be normalized for deduplication."""
+ # Different URL variations that should be normalized to same hash
+ urls = [
+ "http://example.com/article",
+ "https://example.com/article",
+ "https://www.example.com/article",
+ "https://EXAMPLE.COM/article",
+ "https://example.com/article/",
+ ]
+
+ hashes = [Core.hash_url(url) for url in urls]
+
+ # All should produce the same hash
+ self.assertEqual(len(set(hashes)), 1)
+
+ def test_find_existing_episode_by_hash(self) -> None:
+ """Should find existing episode by normalized URL hash."""
+ # Try different URL variations
+ similar_urls = [
+ "http://example.com/article",
+ "https://www.example.com/article",
+ ]
+
+ for url in similar_urls:
+ url_hash = Core.hash_url(url)
+ episode = Core.Database.find_episode_by_url_hash(url_hash)
+
+ self.assertIsNotNone(episode)
+ self.assertEqual(episode["id"], self.episode_id) # type: ignore[index]
+
+ def test_add_existing_episode_to_user_feed(self) -> None:
+ """Should add existing episode to new user's feed."""
+ # Create second user
+ user2_id, _ = Core.Database.create_user("user2@example.com")
+
+ # Add existing episode to their feed
+ Core.Database.add_episode_to_user(user2_id, self.episode_id)
+
+ # Verify it appears in their feed
+ episodes = Core.Database.get_user_episodes(user2_id)
+ episode_ids = [e["id"] for e in episodes]
+
+ self.assertIn(self.episode_id, episode_ids)
+
+
+class TestMetricsTracking(BaseWebTest):
+ """Test episode metrics tracking."""
+
+ def setUp(self) -> None:
+ """Set up test database, client, and create test episode."""
+ super().setUp()
+
+ self.user_id, _ = Core.Database.create_user(
+ "test@example.com",
+ status="active",
+ )
+
+ self.episode_id = Core.Database.create_episode(
+ title="Test Episode",
+ audio_url="https://example.com/audio.mp3",
+ duration=300,
+ content_length=1000,
+ user_id=self.user_id,
+ author="Test Author",
+ original_url="https://example.com/article",
+ original_url_hash=Core.hash_url("https://example.com/article"),
+ )
+
+ def test_track_episode_added(self) -> None:
+ """Should track when episode is added to feed."""
+ Core.Database.track_episode_event(
+ self.episode_id,
+ "added",
+ self.user_id,
+ )
+
+ # Verify metric was recorded
+ metrics = Core.Database.get_episode_metrics(self.episode_id)
+ self.assertEqual(len(metrics), 1)
+ self.assertEqual(metrics[0]["event_type"], "added")
+ self.assertEqual(metrics[0]["user_id"], self.user_id)
+
+ def test_track_episode_played(self) -> None:
+ """Should track when episode is played."""
+ Core.Database.track_episode_event(
+ self.episode_id,
+ "played",
+ self.user_id,
+ )
+
+ metrics = Core.Database.get_episode_metrics(self.episode_id)
+ self.assertEqual(len(metrics), 1)
+ self.assertEqual(metrics[0]["event_type"], "played")
+
+ def test_track_anonymous_play(self) -> None:
+ """Should track plays from anonymous users."""
+ Core.Database.track_episode_event(
+ self.episode_id,
+ "played",
+ user_id=None,
+ )
+
+ metrics = Core.Database.get_episode_metrics(self.episode_id)
+ self.assertEqual(len(metrics), 1)
+ self.assertEqual(metrics[0]["event_type"], "played")
+ self.assertIsNone(metrics[0]["user_id"])
+
+ def test_track_endpoint(self) -> None:
+ """POST /episode/{id}/track should record metrics."""
+ # Login as user
+ self.client.post("/login", data={"email": "test@example.com"})
+
+ response = self.client.post(
+ f"/episode/{self.episode_id}/track",
+ data={"event_type": "played"},
+ )
+
+ self.assertEqual(response.status_code, 200)
+
+ # Verify metric was recorded
+ metrics = Core.Database.get_episode_metrics(self.episode_id)
+ played_metrics = [m for m in metrics if m["event_type"] == "played"]
+ self.assertGreater(len(played_metrics), 0)
+
+
def test() -> None:
"""Run all tests for the web module."""
Test.run(
@@ -2375,6 +2611,9 @@ def test() -> None:
TestAdminInterface,
TestJobCancellation,
TestEpisodeDetailPage,
+ TestPublicFeed,
+ TestEpisodeDeduplication,
+ TestMetricsTracking,
],
)
diff --git a/Biz/PodcastItLater/Worker.py b/Biz/PodcastItLater/Worker.py
index 75a111c..5203490 100644
--- a/Biz/PodcastItLater/Worker.py
+++ b/Biz/PodcastItLater/Worker.py
@@ -129,8 +129,13 @@ class ArticleProcessor:
self.s3_client = None
@staticmethod
- def extract_article_content(url: str) -> tuple[str, str]:
- """Extract title and content from article URL using trafilatura.
+ def extract_article_content(
+ url: str,
+ ) -> tuple[str, str, str | None, str | None]:
+ """Extract title, content, author, and date from article URL.
+
+ Returns:
+ tuple: (title, content, author, publication_date)
Raises:
ValueError: If content cannot be downloaded, extracted, or large.
@@ -165,6 +170,8 @@ class ArticleProcessor:
title = data.get("title", "Untitled Article")
content = data.get("text", "")
+ author = data.get("author")
+ pub_date = data.get("date")
if not content:
msg = f"No content extracted from {url}"
@@ -179,23 +186,50 @@ class ArticleProcessor:
)
content = content[:MAX_ARTICLE_SIZE]
- logger.info("Extracted article: %s (%d chars)", title, len(content))
+ logger.info(
+ "Extracted article: %s (%d chars, author: %s, date: %s)",
+ title,
+ len(content),
+ author or "unknown",
+ pub_date or "unknown",
+ )
except Exception:
logger.exception("Failed to extract content from %s", url)
raise
else:
- return title, content
+ return title, content, author, pub_date
- def text_to_speech(self, text: str, title: str) -> bytes:
- """Convert text to speech using OpenAI TTS API.
+ def text_to_speech(
+ self,
+ text: str,
+ title: str,
+ author: str | None = None,
+ pub_date: str | None = None,
+ ) -> bytes:
+ """Convert text to speech with intro/outro using OpenAI TTS API.
Uses parallel processing for chunks while maintaining order.
+ Adds intro with metadata and outro with attribution.
+
+ Args:
+ text: Article content to convert
+ title: Article title
+ author: Article author (optional)
+ pub_date: Publication date (optional)
Raises:
ValueError: If no chunks are generated from text.
"""
try:
- # Use LLM to prepare and chunk the text
+ # Generate intro audio
+ intro_text = self._create_intro_text(title, author, pub_date)
+ intro_audio = self._generate_tts_segment(intro_text)
+
+ # Generate outro audio
+ outro_text = self._create_outro_text(title, author)
+ outro_audio = self._generate_tts_segment(outro_text)
+
+ # Use LLM to prepare and chunk the main content
chunks = prepare_text_for_tts(text, title)
if not chunks:
@@ -212,54 +246,144 @@ class ArticleProcessor:
"processing",
mem_usage,
)
- return self._text_to_speech_serial(chunks)
+ content_audio_bytes = self._text_to_speech_serial(chunks)
+ else:
+ # Determine max workers
+ max_workers = min(
+ 4, # Reasonable limit to avoid rate limiting
+ len(chunks), # No more workers than chunks
+ max(1, psutil.cpu_count() // 2), # Use half of CPU cores
+ )
- # Determine max workers based on chunk count and system resources
- max_workers = min(
- 4, # Reasonable limit to avoid rate limiting
- len(chunks), # No more workers than chunks
- max(1, psutil.cpu_count() // 2), # Use half of CPU cores
- )
+ logger.info(
+ "Using %d workers for parallel TTS processing",
+ max_workers,
+ )
- logger.info(
- "Using %d workers for parallel TTS processing",
- max_workers,
+ # Process chunks in parallel
+ chunk_results: list[tuple[int, bytes]] = []
+
+ with concurrent.futures.ThreadPoolExecutor(
+ max_workers=max_workers,
+ ) as executor:
+ # Submit all chunks for processing
+ future_to_index = {
+ executor.submit(self._process_tts_chunk, chunk, i): i
+ for i, chunk in enumerate(chunks)
+ }
+
+ # Collect results as they complete
+ for future in concurrent.futures.as_completed(
+ future_to_index,
+ ):
+ index = future_to_index[future]
+ try:
+ audio_data = future.result()
+ chunk_results.append((index, audio_data))
+ except Exception:
+ logger.exception(
+ "Failed to process chunk %d",
+ index,
+ )
+ raise
+
+ # Sort results by index to maintain order
+ chunk_results.sort(key=operator.itemgetter(0))
+
+ # Combine audio chunks
+ content_audio_bytes = self._combine_audio_chunks([
+ data for _, data in chunk_results
+ ])
+
+ # Combine intro, content, and outro with pauses
+ return ArticleProcessor._combine_intro_content_outro(
+ intro_audio,
+ content_audio_bytes,
+ outro_audio,
)
- # Process chunks in parallel
- chunk_results: list[tuple[int, bytes]] = []
-
- with concurrent.futures.ThreadPoolExecutor(
- max_workers=max_workers,
- ) as executor:
- # Submit all chunks for processing
- future_to_index = {
- executor.submit(self._process_tts_chunk, chunk, i): i
- for i, chunk in enumerate(chunks)
- }
-
- # Collect results as they complete
- for future in concurrent.futures.as_completed(future_to_index):
- index = future_to_index[future]
- try:
- audio_data = future.result()
- chunk_results.append((index, audio_data))
- except Exception:
- logger.exception("Failed to process chunk %d", index)
- raise
-
- # Sort results by index to maintain order
- chunk_results.sort(key=operator.itemgetter(0))
-
- # Combine audio chunks
- return self._combine_audio_chunks([
- data for _, data in chunk_results
- ])
-
except Exception:
logger.exception("TTS generation failed")
raise
+ @staticmethod
+ def _create_intro_text(
+ title: str,
+ author: str | None,
+ pub_date: str | None,
+ ) -> str:
+ """Create intro text with available metadata."""
+ parts = [f"Title: {title}"]
+
+ if author:
+ parts.append(f"Author: {author}")
+
+ if pub_date:
+ parts.append(f"Published: {pub_date}")
+
+ return ". ".join(parts) + "."
+
+ @staticmethod
+ def _create_outro_text(title: str, author: str | None) -> str:
+ """Create outro text with attribution."""
+ if author:
+ return (
+ f"This has been an audio version of {title} "
+ f"by {author}, created using Podcast It Later."
+ )
+ return (
+ f"This has been an audio version of {title}, "
+ "created using Podcast It Later."
+ )
+
+ def _generate_tts_segment(self, text: str) -> bytes:
+ """Generate TTS audio for a single segment (intro/outro).
+
+ Args:
+ text: Text to convert to speech
+
+ Returns:
+ MP3 audio bytes
+ """
+ response = self.openai_client.audio.speech.create(
+ model=TTS_MODEL,
+ voice=TTS_VOICE,
+ input=text,
+ )
+ return response.content
+
+ @staticmethod
+ def _combine_intro_content_outro(
+ intro_audio: bytes,
+ content_audio: bytes,
+ outro_audio: bytes,
+ ) -> bytes:
+ """Combine intro, content, and outro with 1-second pauses.
+
+ Args:
+ intro_audio: MP3 bytes for intro
+ content_audio: MP3 bytes for main content
+ outro_audio: MP3 bytes for outro
+
+ Returns:
+ Combined MP3 audio bytes
+ """
+ # Load audio segments
+ intro = AudioSegment.from_mp3(io.BytesIO(intro_audio))
+ content = AudioSegment.from_mp3(io.BytesIO(content_audio))
+ outro = AudioSegment.from_mp3(io.BytesIO(outro_audio))
+
+ # Create 1-second silence
+ pause = AudioSegment.silent(duration=1000) # milliseconds
+
+ # Combine segments with pauses
+ combined = intro + pause + content + pause + outro
+
+ # Export to bytes
+ output = io.BytesIO()
+ combined.export(output, format="mp3")
+ return output.getvalue()
+
def _process_tts_chunk(self, chunk: str, index: int) -> bytes:
"""Process a single TTS chunk.
@@ -496,15 +620,17 @@ class ArticleProcessor:
return
# Step 1: Extract article content
- title, content = ArticleProcessor.extract_article_content(url)
+ title, content, author, pub_date = (
+ ArticleProcessor.extract_article_content(url)
+ )
if self.shutdown_handler.is_shutdown_requested():
logger.info("Shutdown requested, aborting job %d", job_id)
Core.Database.update_job_status(job_id, "pending")
return
- # Step 2: Generate audio
- audio_data = self.text_to_speech(content, title)
+ # Step 2: Generate audio with metadata
+ audio_data = self.text_to_speech(content, title, author, pub_date)
if self.shutdown_handler.is_shutdown_requested():
logger.info("Shutdown requested, aborting job %d", job_id)
@@ -922,12 +1048,16 @@ class TestArticleExtraction(Test.TestCase):
return_value=mock_result,
),
):
- title, content = ArticleProcessor.extract_article_content(
- "https://example.com",
+ title, content, author, pub_date = (
+ ArticleProcessor.extract_article_content(
+ "https://example.com",
+ )
)
self.assertEqual(title, "Test Article")
self.assertEqual(content, "Content here")
+ self.assertIsNone(author)
+ self.assertIsNone(pub_date)
def test_extract_missing_title(self) -> None:
"""Handle articles without titles."""
@@ -944,12 +1074,16 @@ class TestArticleExtraction(Test.TestCase):
return_value=mock_result,
),
):
- title, content = ArticleProcessor.extract_article_content(
- "https://example.com",
+ title, content, author, pub_date = (
+ ArticleProcessor.extract_article_content(
+ "https://example.com",
+ )
)
self.assertEqual(title, "Untitled Article")
self.assertEqual(content, "Content without title")
+ self.assertIsNone(author)
+ self.assertIsNone(pub_date)
def test_extract_empty_content(self) -> None:
"""Handle empty articles."""
@@ -1020,8 +1154,10 @@ class TestArticleExtraction(Test.TestCase):
return_value=mock_result,
),
):
- _title, content = ArticleProcessor.extract_article_content(
- "https://example.com",
+ _title, content, _author, _pub_date = (
+ ArticleProcessor.extract_article_content(
+ "https://example.com",
+ )
)
self.assertEqual(content, "Good content")
@@ -1458,6 +1594,92 @@ class TestTextToSpeech(Test.TestCase):
self.assertEqual(audio_data, b"ordered-audio")
+class TestIntroOutro(Test.TestCase):
+ """Test intro and outro generation with metadata."""
+
+ def test_create_intro_text_full_metadata(self) -> None:
+ """Test intro text creation with all metadata."""
+ intro = ArticleProcessor._create_intro_text( # noqa: SLF001
+ title="Test Article",
+ author="John Doe",
+ pub_date="2024-01-15",
+ )
+ self.assertIn("Title: Test Article", intro)
+ self.assertIn("Author: John Doe", intro)
+ self.assertIn("Published: 2024-01-15", intro)
+
+ def test_create_intro_text_no_author(self) -> None:
+ """Test intro text without author."""
+ intro = ArticleProcessor._create_intro_text( # noqa: SLF001
+ title="Test Article",
+ author=None,
+ pub_date="2024-01-15",
+ )
+ self.assertIn("Title: Test Article", intro)
+ self.assertNotIn("Author:", intro)
+ self.assertIn("Published: 2024-01-15", intro)
+
+ def test_create_intro_text_minimal(self) -> None:
+ """Test intro text with only title."""
+ intro = ArticleProcessor._create_intro_text( # noqa: SLF001
+ title="Test Article",
+ author=None,
+ pub_date=None,
+ )
+ self.assertEqual(intro, "Title: Test Article.")
+
+ def test_create_outro_text_with_author(self) -> None:
+ """Test outro text with author."""
+ outro = ArticleProcessor._create_outro_text( # noqa: SLF001
+ title="Test Article",
+ author="Jane Smith",
+ )
+ self.assertIn("Test Article", outro)
+ self.assertIn("Jane Smith", outro)
+ self.assertIn("Podcast It Later", outro)
+
+ def test_create_outro_text_no_author(self) -> None:
+ """Test outro text without author."""
+ outro = ArticleProcessor._create_outro_text( # noqa: SLF001
+ title="Test Article",
+ author=None,
+ )
+ self.assertIn("Test Article", outro)
+ self.assertNotIn("by", outro)
+ self.assertIn("Podcast It Later", outro)
+
+ def test_extract_with_metadata(self) -> None:
+ """Test that extraction returns metadata."""
+ mock_html = "<html><body><p>Content</p></body></html>"
+ mock_result = json.dumps({
+ "title": "Test Article",
+ "text": "Article content",
+ "author": "Test Author",
+ "date": "2024-01-15",
+ })
+
+ with (
+ unittest.mock.patch(
+ "trafilatura.fetch_url",
+ return_value=mock_html,
+ ),
+ unittest.mock.patch(
+ "trafilatura.extract",
+ return_value=mock_result,
+ ),
+ ):
+ title, content, author, pub_date = (
+ ArticleProcessor.extract_article_content(
+ "https://example.com",
+ )
+ )
+
+ self.assertEqual(title, "Test Article")
+ self.assertEqual(content, "Article content")
+ self.assertEqual(author, "Test Author")
+ self.assertEqual(pub_date, "2024-01-15")
+
+
class TestMemoryEfficiency(Test.TestCase):
"""Test memory-efficient processing."""
@@ -1494,8 +1716,10 @@ class TestMemoryEfficiency(Test.TestCase):
return_value=mock_result,
),
):
- title, content = ArticleProcessor.extract_article_content(
- "https://example.com",
+ title, content, _author, _pub_date = (
+ ArticleProcessor.extract_article_content(
+ "https://example.com",
+ )
)
self.assertEqual(title, "Large Article")
@@ -1558,7 +1782,12 @@ class TestJobProcessing(Test.TestCase):
unittest.mock.patch.object(
ArticleProcessor,
"extract_article_content",
- return_value=("Test Title", "Test content"),
+ return_value=(
+ "Test Title",
+ "Test content",
+ "Test Author",
+ "2024-01-15",
+ ),
),
unittest.mock.patch.object(
ArticleProcessor,
@@ -1727,7 +1956,12 @@ class TestJobProcessing(Test.TestCase):
unittest.mock.patch.object(
ArticleProcessor,
"extract_article_content",
- return_value=("Test Title", "Test content"),
+ return_value=(
+ "Test Title",
+ "Test content",
+ "Test Author",
+ "2024-01-15",
+ ),
),
unittest.mock.patch.object(
ArticleProcessor,