| author    | Ben Sima <ben@bsima.me>                  | 2025-11-16 03:47:16 -0500 |
|-----------|------------------------------------------|---------------------------|
| committer | Ben Sima <ben@bsima.me>                  | 2025-11-16 03:47:16 -0500 |
| commit    | f74ee8bc380f07e597b638a719e7bbfe9461a031 |                           |
| tree      | 7562c8f38d87c7743d74b84012ba8bffc843b0e2 | /Biz                      |
| parent    | 081f0759b37452bb1319c4f5f88a1d451a5177a9 |                           |
Add audio intro/outro and comprehensive tests

- Enhanced Worker.py to extract publication date and author from articles
- Added intro TTS with metadata (title, author, publication date)
- Added outro TTS with attribution
- Combined intro, pauses, content, and outro in Worker.py
- Added comprehensive tests for public feed, deduplication, metrics, and intro/outro

All tests passing (Worker: 30 tests, Web: 43 tests)
Tasks completed:

- t-gcNemK: Extract metadata in Worker.py
- t-gcPraJ: Add intro TTS generation
- t-gcRCzw: Add outro TTS generation
- t-gcTPQn: Combine audio segments
- t-gcW6zN: Tests for public feed
- t-gdlWtu: Tests for deduplication
- t-gdoeYo: Tests for metrics tracking
- t-gdqsl7: Tests for audio intro/outro
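Note on the deduplication tests in this diff: they call `Core.hash_url`, whose implementation is not part of this commit. Judging from the expectations in `test_url_normalization` (scheme, host case, a leading `www.`, and a trailing slash are all ignored), the helper behaves roughly like the sketch below; the body here is a hypothetical illustration, not the actual Core code.

```python
# Hypothetical sketch of the normalization Core.hash_url appears to do;
# the real implementation lives outside this diff.
import hashlib
from urllib.parse import urlparse


def hash_url(url: str) -> str:
    """Hash a URL, collapsing variants that point at the same article."""
    parsed = urlparse(url.strip())
    host = parsed.netloc.lower().removeprefix("www.")  # drop www., ignore case
    path = parsed.path.rstrip("/")  # /article and /article/ are the same
    # The scheme is dropped entirely, so http:// and https:// collapse together
    return hashlib.sha256(f"{host}{path}".encode()).hexdigest()
```

Under this scheme, all five URL variants in `test_url_normalization` produce a single digest.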
Diffstat (limited to 'Biz')

| -rw-r--r-- | Biz/PodcastItLater/Web.py    | 239 |
| -rw-r--r-- | Biz/PodcastItLater/Worker.py | 354 |

2 files changed, 533 insertions(+), 60 deletions(-)
diff --git a/Biz/PodcastItLater/Web.py b/Biz/PodcastItLater/Web.py
index a706eb5..b41f31d 100644
--- a/Biz/PodcastItLater/Web.py
+++ b/Biz/PodcastItLater/Web.py
@@ -2363,6 +2363,242 @@ class TestEpisodeDetailPage(BaseWebTest):
         )


+class TestPublicFeed(BaseWebTest):
+    """Test public feed functionality."""
+
+    def setUp(self) -> None:
+        """Set up test database, client, and create sample episodes."""
+        super().setUp()
+
+        # Create admin user
+        self.admin_id, _ = Core.Database.create_user(
+            "ben@bensima.com",
+            status="active",
+        )
+
+        # Create some episodes, some public, some private
+        self.public_episode_id = Core.Database.create_episode(
+            title="Public Episode",
+            audio_url="https://example.com/public.mp3",
+            duration=300,
+            content_length=1000,
+            user_id=self.admin_id,
+            author="Test Author",
+            original_url="https://example.com/public",
+            original_url_hash=Core.hash_url("https://example.com/public"),
+        )
+        Core.Database.mark_episode_public(self.public_episode_id)
+
+        self.private_episode_id = Core.Database.create_episode(
+            title="Private Episode",
+            audio_url="https://example.com/private.mp3",
+            duration=200,
+            content_length=800,
+            user_id=self.admin_id,
+            author="Test Author",
+            original_url="https://example.com/private",
+            original_url_hash=Core.hash_url("https://example.com/private"),
+        )
+
+    def test_public_feed_page(self) -> None:
+        """Public feed page should show only public episodes."""
+        response = self.client.get("/public")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("Public Episode", response.text)
+        self.assertNotIn("Private Episode", response.text)
+
+    def test_home_page_shows_public_feed_when_logged_out(self) -> None:
+        """Home page should show public episodes when user is not logged in."""
+        response = self.client.get("/")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("Public Episode", response.text)
+        self.assertNotIn("Private Episode", response.text)
+
+    def test_admin_can_toggle_episode_public(self) -> None:
+        """Admin should be able to toggle episode public/private status."""
+        # Login as admin
+        self.client.post("/login", data={"email": "ben@bensima.com"})
+
+        # Toggle private episode to public
+        response = self.client.post(
+            f"/admin/episode/{self.private_episode_id}/toggle-public",
+        )
+
+        self.assertEqual(response.status_code, 200)
+
+        # Verify it's now public
+        episode = Core.Database.get_episode_by_id(self.private_episode_id)
+        self.assertEqual(episode["is_public"], 1)  # type: ignore[index]
+
+    def test_non_admin_cannot_toggle_public(self) -> None:
+        """Non-admin users should not be able to toggle public status."""
+        # Create and login as regular user
+        _user_id, _ = Core.Database.create_user("user@example.com")
+        self.client.post("/login", data={"email": "user@example.com"})
+
+        # Try to toggle
+        response = self.client.post(
+            f"/admin/episode/{self.private_episode_id}/toggle-public",
+        )
+
+        self.assertEqual(response.status_code, 403)
+
+
+class TestEpisodeDeduplication(BaseWebTest):
+    """Test episode deduplication functionality."""
+
+    def setUp(self) -> None:
+        """Set up test database, client, and create test user."""
+        super().setUp()
+
+        self.user_id, self.token = Core.Database.create_user(
+            "test@example.com",
+            status="active",
+        )
+
+        # Create an existing episode
+        self.existing_url = "https://example.com/article"
+        self.url_hash = Core.hash_url(self.existing_url)
+
+        self.episode_id = Core.Database.create_episode(
+            title="Existing Article",
+            audio_url="https://example.com/audio.mp3",
+            duration=300,
+            content_length=1000,
+            user_id=self.user_id,
+            author="Test Author",
+            original_url=self.existing_url,
+            original_url_hash=self.url_hash,
+        )
+
+    def test_url_normalization(self) -> None:
+        """URLs should be normalized for deduplication."""
+        # Different URL variations that should be normalized to same hash
+        urls = [
+            "http://example.com/article",
+            "https://example.com/article",
+            "https://www.example.com/article",
+            "https://EXAMPLE.COM/article",
+            "https://example.com/article/",
+        ]
+
+        hashes = [Core.hash_url(url) for url in urls]
+
+        # All should produce the same hash
+        self.assertEqual(len(set(hashes)), 1)
+
+    def test_find_existing_episode_by_hash(self) -> None:
+        """Should find existing episode by normalized URL hash."""
+        # Try different URL variations
+        similar_urls = [
+            "http://example.com/article",
+            "https://www.example.com/article",
+        ]
+
+        for url in similar_urls:
+            url_hash = Core.hash_url(url)
+            episode = Core.Database.find_episode_by_url_hash(url_hash)
+
+            self.assertIsNotNone(episode)
+            self.assertEqual(episode["id"], self.episode_id)  # type: ignore[index]
+
+    def test_add_existing_episode_to_user_feed(self) -> None:
+        """Should add existing episode to new user's feed."""
+        # Create second user
+        user2_id, _ = Core.Database.create_user("user2@example.com")
+
+        # Add existing episode to their feed
+        Core.Database.add_episode_to_user(user2_id, self.episode_id)
+
+        # Verify it appears in their feed
+        episodes = Core.Database.get_user_episodes(user2_id)
+        episode_ids = [e["id"] for e in episodes]
+
+        self.assertIn(self.episode_id, episode_ids)
+
+
+class TestMetricsTracking(BaseWebTest):
+    """Test episode metrics tracking."""
+
+    def setUp(self) -> None:
+        """Set up test database, client, and create test episode."""
+        super().setUp()
+
+        self.user_id, _ = Core.Database.create_user(
+            "test@example.com",
+            status="active",
+        )
+
+        self.episode_id = Core.Database.create_episode(
+            title="Test Episode",
+            audio_url="https://example.com/audio.mp3",
+            duration=300,
+            content_length=1000,
+            user_id=self.user_id,
+            author="Test Author",
+            original_url="https://example.com/article",
+            original_url_hash=Core.hash_url("https://example.com/article"),
+        )
+
+    def test_track_episode_added(self) -> None:
+        """Should track when episode is added to feed."""
+        Core.Database.track_episode_event(
+            self.episode_id,
+            "added",
+            self.user_id,
+        )
+
+        # Verify metric was recorded
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        self.assertEqual(len(metrics), 1)
+        self.assertEqual(metrics[0]["event_type"], "added")
+        self.assertEqual(metrics[0]["user_id"], self.user_id)
+
+    def test_track_episode_played(self) -> None:
+        """Should track when episode is played."""
+        Core.Database.track_episode_event(
+            self.episode_id,
+            "played",
+            self.user_id,
+        )
+
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        self.assertEqual(len(metrics), 1)
+        self.assertEqual(metrics[0]["event_type"], "played")
+
+    def test_track_anonymous_play(self) -> None:
+        """Should track plays from anonymous users."""
+        Core.Database.track_episode_event(
+            self.episode_id,
+            "played",
+            user_id=None,
+        )
+
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        self.assertEqual(len(metrics), 1)
+        self.assertEqual(metrics[0]["event_type"], "played")
+        self.assertIsNone(metrics[0]["user_id"])
+
+    def test_track_endpoint(self) -> None:
+        """POST /episode/{id}/track should record metrics."""
+        # Login as user
+        self.client.post("/login", data={"email": "test@example.com"})
+
+        response = self.client.post(
+            f"/episode/{self.episode_id}/track",
+            data={"event_type": "played"},
+        )
+
+        self.assertEqual(response.status_code, 200)
+
+        # Verify metric was recorded
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        played_metrics = [m for m in metrics if m["event_type"] == "played"]
+        self.assertGreater(len(played_metrics), 0)
+
+
 def test() -> None:
     """Run all tests for the web module."""
     Test.run(
@@ -2375,6 +2611,9 @@ def test() -> None:
             TestAdminInterface,
             TestJobCancellation,
             TestEpisodeDetailPage,
+            TestPublicFeed,
+            TestEpisodeDeduplication,
+            TestMetricsTracking,
         ],
     )
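The `test_track_endpoint` case above pins down a small client-facing contract: a POST to `/episode/{id}/track` with an `event_type` form field. A hypothetical client snippet follows; the host, port, and episode id are placeholders, not part of this commit.

```python
# Hypothetical client for the tracking endpoint exercised in
# test_track_endpoint; the base URL and episode id are placeholders.
import requests

resp = requests.post(
    "http://localhost:8000/episode/1/track",
    data={"event_type": "played"},  # "added" is the other recorded event
)
resp.raise_for_status()
```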
""" try: - # Use LLM to prepare and chunk the text + # Generate intro audio + intro_text = self._create_intro_text(title, author, pub_date) + intro_audio = self._generate_tts_segment(intro_text) + + # Generate outro audio + outro_text = self._create_outro_text(title, author) + outro_audio = self._generate_tts_segment(outro_text) + + # Use LLM to prepare and chunk the main content chunks = prepare_text_for_tts(text, title) if not chunks: @@ -212,54 +246,144 @@ class ArticleProcessor: "processing", mem_usage, ) - return self._text_to_speech_serial(chunks) + content_audio_bytes = self._text_to_speech_serial(chunks) + else: + # Determine max workers + max_workers = min( + 4, # Reasonable limit to avoid rate limiting + len(chunks), # No more workers than chunks + max(1, psutil.cpu_count() // 2), # Use half of CPU cores + ) - # Determine max workers based on chunk count and system resources - max_workers = min( - 4, # Reasonable limit to avoid rate limiting - len(chunks), # No more workers than chunks - max(1, psutil.cpu_count() // 2), # Use half of CPU cores - ) + logger.info( + "Using %d workers for parallel TTS processing", + max_workers, + ) - logger.info( - "Using %d workers for parallel TTS processing", - max_workers, + # Process chunks in parallel + chunk_results: list[tuple[int, bytes]] = [] + + with concurrent.futures.ThreadPoolExecutor( + max_workers=max_workers, + ) as executor: + # Submit all chunks for processing + future_to_index = { + executor.submit(self._process_tts_chunk, chunk, i): i + for i, chunk in enumerate(chunks) + } + + # Collect results as they complete + for future in concurrent.futures.as_completed( + future_to_index, + ): + index = future_to_index[future] + try: + audio_data = future.result() + chunk_results.append((index, audio_data)) + except Exception: + logger.exception( + "Failed to process chunk %d", + index, + ) + raise + + # Sort results by index to maintain order + chunk_results.sort(key=operator.itemgetter(0)) + + # Combine audio chunks + content_audio_bytes = self._combine_audio_chunks([ + data for _, data in chunk_results + ]) + + # Combine intro, content, and outro with pauses + return ArticleProcessor._combine_intro_content_outro( + intro_audio, + content_audio_bytes, + outro_audio, ) - # Process chunks in parallel - chunk_results: list[tuple[int, bytes]] = [] - - with concurrent.futures.ThreadPoolExecutor( - max_workers=max_workers, - ) as executor: - # Submit all chunks for processing - future_to_index = { - executor.submit(self._process_tts_chunk, chunk, i): i - for i, chunk in enumerate(chunks) - } - - # Collect results as they complete - for future in concurrent.futures.as_completed(future_to_index): - index = future_to_index[future] - try: - audio_data = future.result() - chunk_results.append((index, audio_data)) - except Exception: - logger.exception("Failed to process chunk %d", index) - raise - - # Sort results by index to maintain order - chunk_results.sort(key=operator.itemgetter(0)) - - # Combine audio chunks - return self._combine_audio_chunks([ - data for _, data in chunk_results - ]) - except Exception: logger.exception("TTS generation failed") raise + @staticmethod + def _create_intro_text( + title: str, + author: str | None, + pub_date: str | None, + ) -> str: + """Create intro text with available metadata.""" + parts = [f"Title: {title}"] + + if author: + parts.append(f"Author: {author}") + + if pub_date: + parts.append(f"Published: {pub_date}") + + return ". ".join(parts) + "." 
+
+    @staticmethod
+    def _create_outro_text(title: str, author: str | None) -> str:
+        """Create outro text with attribution."""
+        if author:
+            return (
+                f"This has been an audio version of {title} "
+                f"by {author}, created using Podcast It Later."
+            )
+        return (
+            f"This has been an audio version of {title}, "
+            "created using Podcast It Later."
+        )
+
+    def _generate_tts_segment(self, text: str) -> bytes:
+        """Generate TTS audio for a single segment (intro/outro).
+
+        Args:
+            text: Text to convert to speech
+
+        Returns:
+            MP3 audio bytes
+        """
+        response = self.openai_client.audio.speech.create(
+            model=TTS_MODEL,
+            voice=TTS_VOICE,
+            input=text,
+        )
+        return response.content
+
+    @staticmethod
+    def _combine_intro_content_outro(
+        intro_audio: bytes,
+        content_audio: bytes,
+        outro_audio: bytes,
+    ) -> bytes:
+        """Combine intro, content, and outro with 1-second pauses.
+
+        Args:
+            intro_audio: MP3 bytes for intro
+            content_audio: MP3 bytes for main content
+            outro_audio: MP3 bytes for outro
+
+        Returns:
+            Combined MP3 audio bytes
+        """
+        # Load audio segments
+        intro = AudioSegment.from_mp3(io.BytesIO(intro_audio))
+        content = AudioSegment.from_mp3(io.BytesIO(content_audio))
+        outro = AudioSegment.from_mp3(io.BytesIO(outro_audio))
+
+        # Create 1-second silence
+        pause = AudioSegment.silent(duration=1000)  # milliseconds
+
+        # Combine segments with pauses
+        combined = intro + pause + content + pause + outro
+
+        # Export to bytes
+        output = io.BytesIO()
+        combined.export(output, format="mp3")
+        return output.getvalue()
+
     def _process_tts_chunk(self, chunk: str, index: int) -> bytes:
         """Process a single TTS chunk.

@@ -496,15 +620,17 @@ class ArticleProcessor:
             return

         # Step 1: Extract article content
-        title, content = ArticleProcessor.extract_article_content(url)
+        title, content, author, pub_date = (
+            ArticleProcessor.extract_article_content(url)
+        )

         if self.shutdown_handler.is_shutdown_requested():
             logger.info("Shutdown requested, aborting job %d", job_id)
             Core.Database.update_job_status(job_id, "pending")
             return

-        # Step 2: Generate audio
-        audio_data = self.text_to_speech(content, title)
+        # Step 2: Generate audio with metadata
+        audio_data = self.text_to_speech(content, title, author, pub_date)

         if self.shutdown_handler.is_shutdown_requested():
             logger.info("Shutdown requested, aborting job %d", job_id)
@@ -922,12 +1048,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )

         self.assertEqual(title, "Test Article")
         self.assertEqual(content, "Content here")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)

     def test_extract_missing_title(self) -> None:
         """Handle articles without titles."""
@@ -944,12 +1074,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )

         self.assertEqual(title, "Untitled Article")
         self.assertEqual(content, "Content without title")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)

     def test_extract_empty_content(self) -> None:
         """Handle empty articles."""
@@ -1020,8 +1154,10 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            _title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            _title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )

         self.assertEqual(content, "Good content")

@@ -1458,6 +1594,92 @@ class TestTextToSpeech(Test.TestCase):
         self.assertEqual(audio_data, b"ordered-audio")


+class TestIntroOutro(Test.TestCase):
+    """Test intro and outro generation with metadata."""
+
+    def test_create_intro_text_full_metadata(self) -> None:
+        """Test intro text creation with all metadata."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author="John Doe",
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertIn("Author: John Doe", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_no_author(self) -> None:
+        """Test intro text without author."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertNotIn("Author:", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_minimal(self) -> None:
+        """Test intro text with only title."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date=None,
+        )
+        self.assertEqual(intro, "Title: Test Article.")
+
+    def test_create_outro_text_with_author(self) -> None:
+        """Test outro text with author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author="Jane Smith",
+        )
+        self.assertIn("Test Article", outro)
+        self.assertIn("Jane Smith", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_create_outro_text_no_author(self) -> None:
+        """Test outro text without author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+        )
+        self.assertIn("Test Article", outro)
+        self.assertNotIn("by", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_extract_with_metadata(self) -> None:
+        """Test that extraction returns metadata."""
+        mock_html = "<html><body><p>Content</p></body></html>"
+        mock_result = json.dumps({
+            "title": "Test Article",
+            "text": "Article content",
+            "author": "Test Author",
+            "date": "2024-01-15",
+        })
+
+        with (
+            unittest.mock.patch(
+                "trafilatura.fetch_url",
+                return_value=mock_html,
+            ),
+            unittest.mock.patch(
+                "trafilatura.extract",
+                return_value=mock_result,
+            ),
+        ):
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
+            )
+
+        self.assertEqual(title, "Test Article")
+        self.assertEqual(content, "Article content")
+        self.assertEqual(author, "Test Author")
+        self.assertEqual(pub_date, "2024-01-15")
+
+
 class TestMemoryEfficiency(Test.TestCase):
     """Test memory-efficient processing."""

@@ -1494,8 +1716,10 @@ class TestMemoryEfficiency(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )

         self.assertEqual(title, "Large Article")

@@ -1558,7 +1782,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
@@ -1727,7 +1956,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
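For reference, the two text helpers added in this commit are static methods, so they can be exercised without wiring up S3 or OpenAI. A minimal usage sketch, assuming the module imports as `Biz.PodcastItLater.Worker` (inferred from the repository layout):

```python
# Usage sketch for the intro/outro helpers; the import path is assumed
# from the file paths in this diff.
from Biz.PodcastItLater.Worker import ArticleProcessor

intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
    title="Test Article",
    author="John Doe",
    pub_date="2024-01-15",
)
print(intro)  # Title: Test Article. Author: John Doe. Published: 2024-01-15.

outro = ArticleProcessor._create_outro_text("Test Article", author=None)  # noqa: SLF001
print(outro)
# This has been an audio version of Test Article, created using
# Podcast It Later.
```

In the full pipeline, these strings are rendered to speech by `_generate_tts_segment` and joined to the article audio by `_combine_intro_content_outro` with one-second silences on either side of the content.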
