Diffstat (limited to 'Biz/PodcastItLater/Worker.py')
-rw-r--r--  Biz/PodcastItLater/Worker.py  354
1 file changed, 294 insertions(+), 60 deletions(-)
diff --git a/Biz/PodcastItLater/Worker.py b/Biz/PodcastItLater/Worker.py
index 75a111c..5203490 100644
--- a/Biz/PodcastItLater/Worker.py
+++ b/Biz/PodcastItLater/Worker.py
@@ -129,8 +129,13 @@ class ArticleProcessor:
         self.s3_client = None
 
     @staticmethod
-    def extract_article_content(url: str) -> tuple[str, str]:
-        """Extract title and content from article URL using trafilatura.
+    def extract_article_content(
+        url: str,
+    ) -> tuple[str, str, str | None, str | None]:
+        """Extract title, content, author, and date from article URL.
+
+        Returns:
+            tuple: (title, content, author, publication_date)
 
         Raises:
             ValueError: If content cannot be downloaded, extracted, or large.
@@ -165,6 +170,8 @@ class ArticleProcessor:
 
             title = data.get("title", "Untitled Article")
             content = data.get("text", "")
+            author = data.get("author")
+            pub_date = data.get("date")
 
             if not content:
                 msg = f"No content extracted from {url}"
@@ -179,23 +186,50 @@ class ArticleProcessor:
                 )
                 content = content[:MAX_ARTICLE_SIZE]
 
-            logger.info("Extracted article: %s (%d chars)", title, len(content))
+            logger.info(
+                "Extracted article: %s (%d chars, author: %s, date: %s)",
+                title,
+                len(content),
+                author or "unknown",
+                pub_date or "unknown",
+            )
         except Exception:
             logger.exception("Failed to extract content from %s", url)
             raise
         else:
-            return title, content
+            return title, content, author, pub_date
 
-    def text_to_speech(self, text: str, title: str) -> bytes:
-        """Convert text to speech using OpenAI TTS API.
+    def text_to_speech(
+        self,
+        text: str,
+        title: str,
+        author: str | None = None,
+        pub_date: str | None = None,
+    ) -> bytes:
+        """Convert text to speech with intro/outro using OpenAI TTS API.
 
         Uses parallel processing for chunks while maintaining order.
+        Adds intro with metadata and outro with attribution.
+
+        Args:
+            text: Article content to convert
+            title: Article title
+            author: Article author (optional)
+            pub_date: Publication date (optional)
 
         Raises:
             ValueError: If no chunks are generated from text.
         """
         try:
-            # Use LLM to prepare and chunk the text
+            # Generate intro audio
+            intro_text = self._create_intro_text(title, author, pub_date)
+            intro_audio = self._generate_tts_segment(intro_text)
+
+            # Generate outro audio
+            outro_text = self._create_outro_text(title, author)
+            outro_audio = self._generate_tts_segment(outro_text)
+
+            # Use LLM to prepare and chunk the main content
            chunks = prepare_text_for_tts(text, title)
 
             if not chunks:
@@ -212,54 +246,144 @@ class ArticleProcessor:
                     "processing",
                     mem_usage,
                 )
-                return self._text_to_speech_serial(chunks)
+                content_audio_bytes = self._text_to_speech_serial(chunks)
+            else:
+                # Determine max workers
+                max_workers = min(
+                    4,  # Reasonable limit to avoid rate limiting
+                    len(chunks),  # No more workers than chunks
+                    max(1, psutil.cpu_count() // 2),  # Use half of CPU cores
+                )
 
-            # Determine max workers based on chunk count and system resources
-            max_workers = min(
-                4,  # Reasonable limit to avoid rate limiting
-                len(chunks),  # No more workers than chunks
-                max(1, psutil.cpu_count() // 2),  # Use half of CPU cores
-            )
+                logger.info(
+                    "Using %d workers for parallel TTS processing",
+                    max_workers,
+                )
 
-            logger.info(
-                "Using %d workers for parallel TTS processing",
-                max_workers,
+                # Process chunks in parallel
+                chunk_results: list[tuple[int, bytes]] = []
+
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=max_workers,
+                ) as executor:
+                    # Submit all chunks for processing
+                    future_to_index = {
+                        executor.submit(self._process_tts_chunk, chunk, i): i
+                        for i, chunk in enumerate(chunks)
+                    }
+
+                    # Collect results as they complete
+                    for future in concurrent.futures.as_completed(
+                        future_to_index,
+                    ):
+                        index = future_to_index[future]
+                        try:
+                            audio_data = future.result()
+                            chunk_results.append((index, audio_data))
+                        except Exception:
+                            logger.exception(
+                                "Failed to process chunk %d",
+                                index,
+                            )
+                            raise
+
+                # Sort results by index to maintain order
+                chunk_results.sort(key=operator.itemgetter(0))
+
+                # Combine audio chunks
+                content_audio_bytes = self._combine_audio_chunks([
+                    data for _, data in chunk_results
+                ])
+
+            # Combine intro, content, and outro with pauses
+            return ArticleProcessor._combine_intro_content_outro(
+                intro_audio,
+                content_audio_bytes,
+                outro_audio,
             )
 
-            # Process chunks in parallel
-            chunk_results: list[tuple[int, bytes]] = []
-
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=max_workers,
-            ) as executor:
-                # Submit all chunks for processing
-                future_to_index = {
-                    executor.submit(self._process_tts_chunk, chunk, i): i
-                    for i, chunk in enumerate(chunks)
-                }
-
-                # Collect results as they complete
-                for future in concurrent.futures.as_completed(future_to_index):
-                    index = future_to_index[future]
-                    try:
-                        audio_data = future.result()
-                        chunk_results.append((index, audio_data))
-                    except Exception:
-                        logger.exception("Failed to process chunk %d", index)
-                        raise
-
-            # Sort results by index to maintain order
-            chunk_results.sort(key=operator.itemgetter(0))
-
-            # Combine audio chunks
-            return self._combine_audio_chunks([
-                data for _, data in chunk_results
-            ])
-
         except Exception:
             logger.exception("TTS generation failed")
             raise
 
+    @staticmethod
+    def _create_intro_text(
+        title: str,
+        author: str | None,
+        pub_date: str | None,
+    ) -> str:
+        """Create intro text with available metadata."""
+        parts = [f"Title: {title}"]
+
+        if author:
+            parts.append(f"Author: {author}")
+
+        if pub_date:
+            parts.append(f"Published: {pub_date}")
+
+        return ". ".join(parts) + "."
+
+    @staticmethod
+    def _create_outro_text(title: str, author: str | None) -> str:
+        """Create outro text with attribution."""
+        if author:
+            return (
+                f"This has been an audio version of {title} "
+                f"by {author}, created using Podcast It Later."
+            )
+        return (
+            f"This has been an audio version of {title}, "
+            "created using Podcast It Later."
+        )
+
+    def _generate_tts_segment(self, text: str) -> bytes:
+        """Generate TTS audio for a single segment (intro/outro).
+
+        Args:
+            text: Text to convert to speech
+
+        Returns:
+            MP3 audio bytes
+        """
+        response = self.openai_client.audio.speech.create(
+            model=TTS_MODEL,
+            voice=TTS_VOICE,
+            input=text,
+        )
+        return response.content
+
+    @staticmethod
+    def _combine_intro_content_outro(
+        intro_audio: bytes,
+        content_audio: bytes,
+        outro_audio: bytes,
+    ) -> bytes:
+        """Combine intro, content, and outro with 1-second pauses.
+
+        Args:
+            intro_audio: MP3 bytes for intro
+            content_audio: MP3 bytes for main content
+            outro_audio: MP3 bytes for outro
+
+        Returns:
+            Combined MP3 audio bytes
+        """
+        # Load audio segments
+        intro = AudioSegment.from_mp3(io.BytesIO(intro_audio))
+        content = AudioSegment.from_mp3(io.BytesIO(content_audio))
+        outro = AudioSegment.from_mp3(io.BytesIO(outro_audio))
+
+        # Create 1-second silence
+        pause = AudioSegment.silent(duration=1000)  # milliseconds
+
+        # Combine segments with pauses
+        combined = intro + pause + content + pause + outro
+
+        # Export to bytes
+        output = io.BytesIO()
+        combined.export(output, format="mp3")
+        return output.getvalue()
+
     def _process_tts_chunk(self, chunk: str, index: int) -> bytes:
         """Process a single TTS chunk.
 
@@ -496,15 +620,17 @@ class ArticleProcessor:
             return
 
         # Step 1: Extract article content
-        title, content = ArticleProcessor.extract_article_content(url)
+        title, content, author, pub_date = (
+            ArticleProcessor.extract_article_content(url)
+        )
 
         if self.shutdown_handler.is_shutdown_requested():
             logger.info("Shutdown requested, aborting job %d", job_id)
             Core.Database.update_job_status(job_id, "pending")
             return
 
-        # Step 2: Generate audio
-        audio_data = self.text_to_speech(content, title)
+        # Step 2: Generate audio with metadata
+        audio_data = self.text_to_speech(content, title, author, pub_date)
 
         if self.shutdown_handler.is_shutdown_requested():
             logger.info("Shutdown requested, aborting job %d", job_id)
@@ -922,12 +1048,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Test Article")
         self.assertEqual(content, "Content here")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)
 
     def test_extract_missing_title(self) -> None:
         """Handle articles without titles."""
@@ -944,12 +1074,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Untitled Article")
         self.assertEqual(content, "Content without title")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)
 
     def test_extract_empty_content(self) -> None:
         """Handle empty articles."""
@@ -1020,8 +1154,10 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            _title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            _title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(content, "Good content")
@@ -1458,6 +1594,92 @@ class TestTextToSpeech(Test.TestCase):
 
         self.assertEqual(audio_data, b"ordered-audio")
 
 
+class TestIntroOutro(Test.TestCase):
+    """Test intro and outro generation with metadata."""
+
+    def test_create_intro_text_full_metadata(self) -> None:
+        """Test intro text creation with all metadata."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author="John Doe",
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertIn("Author: John Doe", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_no_author(self) -> None:
+        """Test intro text without author."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertNotIn("Author:", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_minimal(self) -> None:
+        """Test intro text with only title."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date=None,
+        )
+        self.assertEqual(intro, "Title: Test Article.")
+
+    def test_create_outro_text_with_author(self) -> None:
+        """Test outro text with author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author="Jane Smith",
+        )
+        self.assertIn("Test Article", outro)
+        self.assertIn("Jane Smith", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_create_outro_text_no_author(self) -> None:
+        """Test outro text without author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+        )
+        self.assertIn("Test Article", outro)
+        self.assertNotIn("by", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_extract_with_metadata(self) -> None:
+        """Test that extraction returns metadata."""
+        mock_html = "<html><body><p>Content</p></body></html>"
+        mock_result = json.dumps({
+            "title": "Test Article",
+            "text": "Article content",
+            "author": "Test Author",
+            "date": "2024-01-15",
+        })
+
+        with (
+            unittest.mock.patch(
+                "trafilatura.fetch_url",
+                return_value=mock_html,
+            ),
+            unittest.mock.patch(
+                "trafilatura.extract",
+                return_value=mock_result,
+            ),
+        ):
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
+            )
+
+        self.assertEqual(title, "Test Article")
+        self.assertEqual(content, "Article content")
+        self.assertEqual(author, "Test Author")
+        self.assertEqual(pub_date, "2024-01-15")
+
+
 class TestMemoryEfficiency(Test.TestCase):
     """Test memory-efficient processing."""
 
@@ -1494,8 +1716,10 @@ class TestMemoryEfficiency(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Large Article")
@@ -1558,7 +1782,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
@@ -1727,7 +1956,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
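
Note on the parallel path above: the patch keeps chunk order stable by submitting each chunk together with its index, collecting results via as_completed, then sorting by index before concatenation. A standalone sketch of that same stdlib pattern, where process() is a placeholder standing in for the real _process_tts_chunk TTS call:

    import concurrent.futures
    import operator

    def process(chunk: str, index: int) -> bytes:
        # Placeholder for the per-chunk TTS request.
        return f"audio-{index}:{chunk}".encode()

    def process_in_order(chunks: list[str], max_workers: int = 4) -> list[bytes]:
        results: list[tuple[int, bytes]] = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
            # Map each future back to its chunk index.
            future_to_index = {
                ex.submit(process, chunk, i): i for i, chunk in enumerate(chunks)
            }
            for future in concurrent.futures.as_completed(future_to_index):
                results.append((future_to_index[future], future.result()))
        # as_completed yields in completion order, so restore chunk order.
        results.sort(key=operator.itemgetter(0))
        return [data for _, data in results]

    print(process_in_order(["a", "b", "c"]))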

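The new _combine_intro_content_outro helper leans on pydub's AudioSegment operators: segments concatenate with +, and AudioSegment.silent() supplies the gaps. A minimal sketch of that pattern, assuming pydub and an ffmpeg binary are installed; the tone_mp3 helper is a hypothetical stand-in for real TTS output:

    import io

    from pydub import AudioSegment
    from pydub.generators import Sine

    def join_with_pauses(segments_mp3: list[bytes], pause_ms: int = 1000) -> bytes:
        """Decode mp3 byte strings, join them with silent gaps, re-encode once."""
        pause = AudioSegment.silent(duration=pause_ms)
        combined = AudioSegment.empty()
        for i, raw in enumerate(segments_mp3):
            if i:
                combined += pause
            combined += AudioSegment.from_mp3(io.BytesIO(raw))
        out = io.BytesIO()
        combined.export(out, format="mp3")
        return out.getvalue()

    def tone_mp3(freq: int) -> bytes:
        """Hypothetical TTS segment: a half-second test tone encoded as mp3."""
        buf = io.BytesIO()
        Sine(freq).to_audio_segment(duration=500).export(buf, format="mp3")
        return buf.getvalue()

    print(len(join_with_pauses([tone_mp3(440), tone_mp3(660), tone_mp3(880)])))

Decoding every segment and re-encoding once at the end avoids generation-loss from repeated mp3 round-trips, which is presumably why the patch combines raw AudioSegments rather than concatenating mp3 byte strings directly.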