Diffstat (limited to 'Biz/PodcastItLater/Worker.py')
-rw-r--r--  Biz/PodcastItLater/Worker.py  354
1 file changed, 294 insertions(+), 60 deletions(-)
diff --git a/Biz/PodcastItLater/Worker.py b/Biz/PodcastItLater/Worker.py
index 75a111c..5203490 100644
--- a/Biz/PodcastItLater/Worker.py
+++ b/Biz/PodcastItLater/Worker.py
@@ -129,8 +129,13 @@ class ArticleProcessor:
         self.s3_client = None
 
     @staticmethod
-    def extract_article_content(url: str) -> tuple[str, str]:
-        """Extract title and content from article URL using trafilatura.
+    def extract_article_content(
+        url: str,
+    ) -> tuple[str, str, str | None, str | None]:
+        """Extract title, content, author, and date from article URL.
+
+        Returns:
+            tuple: (title, content, author, publication_date)
 
         Raises:
             ValueError: If content cannot be downloaded, extracted, or large.
@@ -165,6 +170,8 @@ class ArticleProcessor:
 
             title = data.get("title", "Untitled Article")
             content = data.get("text", "")
+            author = data.get("author")
+            pub_date = data.get("date")
 
             if not content:
                 msg = f"No content extracted from {url}"
@@ -179,23 +186,50 @@ class ArticleProcessor:
                 )
                 content = content[:MAX_ARTICLE_SIZE]
 
-            logger.info("Extracted article: %s (%d chars)", title, len(content))
+            logger.info(
+                "Extracted article: %s (%d chars, author: %s, date: %s)",
+                title,
+                len(content),
+                author or "unknown",
+                pub_date or "unknown",
+            )
         except Exception:
             logger.exception("Failed to extract content from %s", url)
             raise
         else:
-            return title, content
+            return title, content, author, pub_date
 
-    def text_to_speech(self, text: str, title: str) -> bytes:
-        """Convert text to speech using OpenAI TTS API.
+    def text_to_speech(
+        self,
+        text: str,
+        title: str,
+        author: str | None = None,
+        pub_date: str | None = None,
+    ) -> bytes:
+        """Convert text to speech with intro/outro using OpenAI TTS API.
 
         Uses parallel processing for chunks while maintaining order.
+        Adds intro with metadata and outro with attribution.
+
+        Args:
+            text: Article content to convert
+            title: Article title
+            author: Article author (optional)
+            pub_date: Publication date (optional)
 
         Raises:
             ValueError: If no chunks are generated from text.
         """
         try:
-            # Use LLM to prepare and chunk the text
+            # Generate intro audio
+            intro_text = self._create_intro_text(title, author, pub_date)
+            intro_audio = self._generate_tts_segment(intro_text)
+
+            # Generate outro audio
+            outro_text = self._create_outro_text(title, author)
+            outro_audio = self._generate_tts_segment(outro_text)
+
+            # Use LLM to prepare and chunk the main content
            chunks = prepare_text_for_tts(text, title)
 
             if not chunks:
@@ -212,54 +246,144 @@ class ArticleProcessor:
                     "processing",
                     mem_usage,
                 )
-                return self._text_to_speech_serial(chunks)
+                content_audio_bytes = self._text_to_speech_serial(chunks)
+            else:
+                # Determine max workers
+                max_workers = min(
+                    4,  # Reasonable limit to avoid rate limiting
+                    len(chunks),  # No more workers than chunks
+                    max(1, psutil.cpu_count() // 2),  # Use half of CPU cores
+                )
 
-            # Determine max workers based on chunk count and system resources
-            max_workers = min(
-                4,  # Reasonable limit to avoid rate limiting
-                len(chunks),  # No more workers than chunks
-                max(1, psutil.cpu_count() // 2),  # Use half of CPU cores
-            )
+                logger.info(
+                    "Using %d workers for parallel TTS processing",
+                    max_workers,
+                )
 
-            logger.info(
-                "Using %d workers for parallel TTS processing",
-                max_workers,
+                # Process chunks in parallel
+                chunk_results: list[tuple[int, bytes]] = []
+
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=max_workers,
+                ) as executor:
+                    # Submit all chunks for processing
+                    future_to_index = {
+                        executor.submit(self._process_tts_chunk, chunk, i): i
+                        for i, chunk in enumerate(chunks)
+                    }
+
+                    # Collect results as they complete
+                    for future in concurrent.futures.as_completed(
+                        future_to_index,
+                    ):
+                        index = future_to_index[future]
+                        try:
+                            audio_data = future.result()
+                            chunk_results.append((index, audio_data))
+                        except Exception:
+                            logger.exception(
+                                "Failed to process chunk %d",
+                                index,
+                            )
+                            raise
+
+                # Sort results by index to maintain order
+                chunk_results.sort(key=operator.itemgetter(0))
+
+                # Combine audio chunks
+                content_audio_bytes = self._combine_audio_chunks([
+                    data for _, data in chunk_results
+                ])
+
+            # Combine intro, content, and outro with pauses
+            return ArticleProcessor._combine_intro_content_outro(
+                intro_audio,
+                content_audio_bytes,
+                outro_audio,
             )
 
-            # Process chunks in parallel
-            chunk_results: list[tuple[int, bytes]] = []
-
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=max_workers,
-            ) as executor:
-                # Submit all chunks for processing
-                future_to_index = {
-                    executor.submit(self._process_tts_chunk, chunk, i): i
-                    for i, chunk in enumerate(chunks)
-                }
-
-                # Collect results as they complete
-                for future in concurrent.futures.as_completed(future_to_index):
-                    index = future_to_index[future]
-                    try:
-                        audio_data = future.result()
-                        chunk_results.append((index, audio_data))
-                    except Exception:
-                        logger.exception("Failed to process chunk %d", index)
-                        raise
-
-            # Sort results by index to maintain order
-            chunk_results.sort(key=operator.itemgetter(0))
-
-            # Combine audio chunks
-            return self._combine_audio_chunks([
-                data for _, data in chunk_results
-            ])
-
         except Exception:
             logger.exception("TTS generation failed")
             raise
 
+    @staticmethod
+    def _create_intro_text(
+        title: str,
+        author: str | None,
+        pub_date: str | None,
+    ) -> str:
+        """Create intro text with available metadata."""
+        parts = [f"Title: {title}"]
+
+        if author:
+            parts.append(f"Author: {author}")
+
+        if pub_date:
+            parts.append(f"Published: {pub_date}")
+
+        return ". ".join(parts) + "."
+
+    @staticmethod
+    def _create_outro_text(title: str, author: str | None) -> str:
+        """Create outro text with attribution."""
+        if author:
+            return (
+                f"This has been an audio version of {title} "
+                f"by {author}, created using Podcast It Later."
+            )
+        return (
+            f"This has been an audio version of {title}, "
+            "created using Podcast It Later."
+        )
+
+    def _generate_tts_segment(self, text: str) -> bytes:
+        """Generate TTS audio for a single segment (intro/outro).
+
+        Args:
+            text: Text to convert to speech
+
+        Returns:
+            MP3 audio bytes
+        """
+        response = self.openai_client.audio.speech.create(
+            model=TTS_MODEL,
+            voice=TTS_VOICE,
+            input=text,
+        )
+        return response.content
+
+    @staticmethod
+    def _combine_intro_content_outro(
+        intro_audio: bytes,
+        content_audio: bytes,
+        outro_audio: bytes,
+    ) -> bytes:
+        """Combine intro, content, and outro with 1-second pauses.
+
+        Args:
+            intro_audio: MP3 bytes for intro
+            content_audio: MP3 bytes for main content
+            outro_audio: MP3 bytes for outro
+
+        Returns:
+            Combined MP3 audio bytes
+        """
+        # Load audio segments
+        intro = AudioSegment.from_mp3(io.BytesIO(intro_audio))
+        content = AudioSegment.from_mp3(io.BytesIO(content_audio))
+        outro = AudioSegment.from_mp3(io.BytesIO(outro_audio))
+
+        # Create 1-second silence
+        pause = AudioSegment.silent(duration=1000)  # milliseconds
+
+        # Combine segments with pauses
+        combined = intro + pause + content + pause + outro
+
+        # Export to bytes
+        output = io.BytesIO()
+        combined.export(output, format="mp3")
+        return output.getvalue()
+
     def _process_tts_chunk(self, chunk: str, index: int) -> bytes:
         """Process a single TTS chunk.
 
@@ -496,15 +620,17 @@ class ArticleProcessor:
             return
 
         # Step 1: Extract article content
-        title, content = ArticleProcessor.extract_article_content(url)
+        title, content, author, pub_date = (
+            ArticleProcessor.extract_article_content(url)
+        )
 
         if self.shutdown_handler.is_shutdown_requested():
             logger.info("Shutdown requested, aborting job %d", job_id)
             Core.Database.update_job_status(job_id, "pending")
             return
 
-        # Step 2: Generate audio
-        audio_data = self.text_to_speech(content, title)
+        # Step 2: Generate audio with metadata
+        audio_data = self.text_to_speech(content, title, author, pub_date)
 
         if self.shutdown_handler.is_shutdown_requested():
             logger.info("Shutdown requested, aborting job %d", job_id)
@@ -922,12 +1048,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Test Article")
         self.assertEqual(content, "Content here")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)
 
     def test_extract_missing_title(self) -> None:
         """Handle articles without titles."""
@@ -944,12 +1074,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Untitled Article")
         self.assertEqual(content, "Content without title")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)
 
     def test_extract_empty_content(self) -> None:
         """Handle empty articles."""
@@ -1020,8 +1154,10 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            _title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            _title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(content, "Good content")
@@ -1458,6 +1594,92 @@ class TestTextToSpeech(Test.TestCase):
 
         self.assertEqual(audio_data, b"ordered-audio")
 
 
+class TestIntroOutro(Test.TestCase):
+    """Test intro and outro generation with metadata."""
+
+    def test_create_intro_text_full_metadata(self) -> None:
+        """Test intro text creation with all metadata."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author="John Doe",
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertIn("Author: John Doe", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_no_author(self) -> None:
+        """Test intro text without author."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertNotIn("Author:", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_minimal(self) -> None:
+        """Test intro text with only title."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date=None,
+        )
+        self.assertEqual(intro, "Title: Test Article.")
+
+    def test_create_outro_text_with_author(self) -> None:
+        """Test outro text with author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author="Jane Smith",
+        )
+        self.assertIn("Test Article", outro)
+        self.assertIn("Jane Smith", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_create_outro_text_no_author(self) -> None:
+        """Test outro text without author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+        )
+        self.assertIn("Test Article", outro)
+        self.assertNotIn("by", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_extract_with_metadata(self) -> None:
+        """Test that extraction returns metadata."""
+        mock_html = "<html><body><p>Content</p></body></html>"
+        mock_result = json.dumps({
+            "title": "Test Article",
+            "text": "Article content",
+            "author": "Test Author",
+            "date": "2024-01-15",
+        })
+
+        with (
+            unittest.mock.patch(
+                "trafilatura.fetch_url",
+                return_value=mock_html,
+            ),
+            unittest.mock.patch(
+                "trafilatura.extract",
+                return_value=mock_result,
+            ),
+        ):
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
+            )
+
+        self.assertEqual(title, "Test Article")
+        self.assertEqual(content, "Article content")
+        self.assertEqual(author, "Test Author")
+        self.assertEqual(pub_date, "2024-01-15")
+
+
 class TestMemoryEfficiency(Test.TestCase):
     """Test memory-efficient processing."""
 
@@ -1494,8 +1716,10 @@ class TestMemoryEfficiency(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Large Article")
@@ -1558,7 +1782,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
@@ -1727,7 +1956,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
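
Note on the parallel path above: the patch keeps chunk order stable by submitting each chunk together with its index, collecting results via as_completed, then sorting by index before concatenation. A standalone sketch of that same stdlib pattern, where process() is a placeholder standing in for the real _process_tts_chunk TTS call:

    import concurrent.futures
    import operator

    def process(chunk: str, index: int) -> bytes:
        # Placeholder for the per-chunk TTS request.
        return f"audio-{index}:{chunk}".encode()

    def process_in_order(chunks: list[str], max_workers: int = 4) -> list[bytes]:
        results: list[tuple[int, bytes]] = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
            # Map each future back to its chunk index.
            future_to_index = {
                ex.submit(process, chunk, i): i for i, chunk in enumerate(chunks)
            }
            for future in concurrent.futures.as_completed(future_to_index):
                results.append((future_to_index[future], future.result()))
        # as_completed yields in completion order, so restore chunk order.
        results.sort(key=operator.itemgetter(0))
        return [data for _, data in results]

    print(process_in_order(["a", "b", "c"]))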

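The new _combine_intro_content_outro helper leans on pydub's AudioSegment operators: segments concatenate with +, and AudioSegment.silent() supplies the gaps. A minimal sketch of that pattern, assuming pydub and an ffmpeg binary are installed; the tone_mp3 helper is a hypothetical stand-in for real TTS output:

    import io

    from pydub import AudioSegment
    from pydub.generators import Sine

    def join_with_pauses(segments_mp3: list[bytes], pause_ms: int = 1000) -> bytes:
        """Decode mp3 byte strings, join them with silent gaps, re-encode once."""
        pause = AudioSegment.silent(duration=pause_ms)
        combined = AudioSegment.empty()
        for i, raw in enumerate(segments_mp3):
            if i:
                combined += pause
            combined += AudioSegment.from_mp3(io.BytesIO(raw))
        out = io.BytesIO()
        combined.export(out, format="mp3")
        return out.getvalue()

    def tone_mp3(freq: int) -> bytes:
        """Hypothetical TTS segment: a half-second test tone encoded as mp3."""
        buf = io.BytesIO()
        Sine(freq).to_audio_segment(duration=500).export(buf, format="mp3")
        return buf.getvalue()

    print(len(join_with_pauses([tone_mp3(440), tone_mp3(660), tone_mp3(880)])))

Decoding every segment and re-encoding once at the end avoids generation-loss from repeated mp3 round-trips, which is presumably why the patch combines raw AudioSegments rather than concatenating mp3 byte strings directly.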