Add audio intro/outro and comprehensive tests

- Enhanced Worker.py to extract publication date and author from articles
- Added intro TTS with metadata (title, author, publication date)
- Added outro TTS with attribution
- Combined intro, pauses, content, and outro in Worker.py
- Added comprehensive tests for public feed, deduplication, metrics, and intro/outro

All tests passing (Worker: 30 tests, Web: 43 tests)

Tasks completed:
- t-gcNemK: Extract metadata in Worker.py
- t-gcPraJ: Add intro TTS generation
- t-gcRCzw: Add outro TTS generation
- t-gcTPQn: Combine audio segments
- t-gcW6zN: Tests for public feed
- t-gdlWtu: Tests for deduplication
- t-gdoeYo: Tests for metrics tracking
- t-gdqsl7: Tests for audio intro/outro
author: Ben Sima <ben@bsima.me> 2025-11-16 03:47:16 -0500
committer: Ben Sima <ben@bsima.me> 2025-11-16 03:47:16 -0500
commit: f74ee8bc380f07e597b638a719e7bbfe9461a031 (patch)
tree: 7562c8f38d87c7743d74b84012ba8bffc843b0e2
parent: 081f0759b37452bb1319c4f5f88a1d451a5177a9 (diff)
3 files changed, 543 insertions, 70 deletions
diff --git a/.tasks/tasks.jsonl b/.tasks/tasks.jsonl
index f86f9ab..132d31a 100644
--- a/.tasks/tasks.jsonl
+++ b/.tasks/tasks.jsonl
@@ -99,14 +99,14 @@
 {"taskCreatedAt":"2025-11-16T04:07:17.092115521Z","taskDependencies":[],"taskId":"t-gc6Vrk","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add POST /admin/episode/{id}/toggle-public endpoint","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:13:58.727479053Z"}
 {"taskCreatedAt":"2025-11-16T04:07:17.6266109Z","taskDependencies":[],"taskId":"t-gc9aud","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add '+ Add to your feed' button on episode pages for logged-in users","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:22:35.253656788Z"}
 {"taskCreatedAt":"2025-11-16T04:07:18.165342861Z","taskDependencies":[],"taskId":"t-gcbqDl","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add POST /episode/{id}/add-to-feed endpoint","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:22:35.305050805Z"}
-{"taskCreatedAt":"2025-11-16T04:07:18.700573408Z","taskDependencies":[],"taskId":"t-gcdFSb","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add POST /episode/{id}/track endpoint for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:18.700573408Z"}
-{"taskCreatedAt":"2025-11-16T04:07:19.229153372Z","taskDependencies":[],"taskId":"t-gcfTnG","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add JavaScript to episode player for tracking play events","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:19.229153372Z"}
-{"taskCreatedAt":"2025-11-16T04:07:27.174644219Z","taskDependencies":[],"taskId":"t-gcNemK","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Enhance Worker.py to extract publication date and author metadata from articles","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:27.174644219Z"}
-{"taskCreatedAt":"2025-11-16T04:07:27.700527081Z","taskDependencies":[],"taskId":"t-gcPraJ","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add intro TTS generation with metadata (title, author, date)","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:27.700527081Z"}
-{"taskCreatedAt":"2025-11-16T04:07:28.221004581Z","taskDependencies":[],"taskId":"t-gcRCzw","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Add outro TTS generation with title and author","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:28.221004581Z"}
-{"taskCreatedAt":"2025-11-16T04:07:28.74867703Z","taskDependencies":[],"taskId":"t-gcTPQn","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Combine intro, pauses, article content, and outro in Worker.py","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:28.74867703Z"}
-{"taskCreatedAt":"2025-11-16T04:07:29.289653388Z","taskDependencies":[],"taskId":"t-gcW6zN","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for public feed functionality","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:29.289653388Z"}
-{"taskCreatedAt":"2025-11-16T04:07:35.447349966Z","taskDependencies":[],"taskId":"t-gdlWtu","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for episode deduplication","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:35.447349966Z"}
-{"taskCreatedAt":"2025-11-16T04:07:35.995113703Z","taskDependencies":[],"taskId":"t-gdoeYo","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:35.995113703Z"}
-{"taskCreatedAt":"2025-11-16T04:07:36.52315156Z","taskDependencies":[],"taskId":"t-gdqsl7","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Write tests for audio intro/outro generation","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:36.52315156Z"}
+{"taskCreatedAt":"2025-11-16T04:07:18.700573408Z","taskDependencies":[],"taskId":"t-gcdFSb","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add POST /episode/{id}/track endpoint for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:30:51.238117273Z"}
+{"taskCreatedAt":"2025-11-16T04:07:19.229153372Z","taskDependencies":[],"taskId":"t-gcfTnG","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add JavaScript to episode player for tracking play events","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:30:51.289470508Z"}
+{"taskCreatedAt":"2025-11-16T04:07:27.174644219Z","taskDependencies":[],"taskId":"t-gcNemK","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Enhance Worker.py to extract publication date and author metadata from articles","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.196162661Z"}
+{"taskCreatedAt":"2025-11-16T04:07:27.700527081Z","taskDependencies":[],"taskId":"t-gcPraJ","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add intro TTS generation with metadata (title, author, date)","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.247694148Z"}
+{"taskCreatedAt":"2025-11-16T04:07:28.221004581Z","taskDependencies":[],"taskId":"t-gcRCzw","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Add outro TTS generation with title and author","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.298838151Z"}
+{"taskCreatedAt":"2025-11-16T04:07:28.74867703Z","taskDependencies":[],"taskId":"t-gcTPQn","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Combine intro, pauses, article content, and outro in Worker.py","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.360155369Z"}
+{"taskCreatedAt":"2025-11-16T04:07:29.289653388Z","taskDependencies":[],"taskId":"t-gcW6zN","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for public feed functionality","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.410867588Z"}
+{"taskCreatedAt":"2025-11-16T04:07:35.447349966Z","taskDependencies":[],"taskId":"t-gdlWtu","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for episode deduplication","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.461656748Z"}
+{"taskCreatedAt":"2025-11-16T04:07:35.995113703Z","taskDependencies":[],"taskId":"t-gdoeYo","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for metrics tracking","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.513956262Z"}
+{"taskCreatedAt":"2025-11-16T04:07:36.52315156Z","taskDependencies":[],"taskId":"t-gdqsl7","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Done","taskTitle":"Write tests for audio intro/outro generation","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T08:46:43.574397661Z"}
 {"taskCreatedAt":"2025-11-16T04:07:37.059671738Z","taskDependencies":[],"taskId":"t-gdsHUA","taskNamespace":"Biz/PodcastItLater.hs","taskParent":"t-ga8V8O","taskStatus":"Open","taskTitle":"Create admin metrics dashboard view","taskType":"WorkTask","taskUpdatedAt":"2025-11-16T04:07:37.059671738Z"}
diff --git a/Biz/PodcastItLater/Web.py b/Biz/PodcastItLater/Web.py
index a706eb5..b41f31d 100644
--- a/Biz/PodcastItLater/Web.py
+++ b/Biz/PodcastItLater/Web.py
@@ -2363,6 +2363,242 @@ class TestEpisodeDetailPage(BaseWebTest):
         )
 
 
+class TestPublicFeed(BaseWebTest):
+    """Test public feed functionality."""
+
+    def setUp(self) -> None:
+        """Set up test database, client, and create sample episodes."""
+        super().setUp()
+
+        # Create admin user
+        self.admin_id, _ = Core.Database.create_user(
+            "ben@bensima.com",
+            status="active",
+        )
+
+        # Create some episodes, some public, some private
+        self.public_episode_id = Core.Database.create_episode(
+            title="Public Episode",
+            audio_url="https://example.com/public.mp3",
+            duration=300,
+            content_length=1000,
+            user_id=self.admin_id,
+            author="Test Author",
+            original_url="https://example.com/public",
+            original_url_hash=Core.hash_url("https://example.com/public"),
+        )
+        Core.Database.mark_episode_public(self.public_episode_id)
+
+        self.private_episode_id = Core.Database.create_episode(
+            title="Private Episode",
+            audio_url="https://example.com/private.mp3",
+            duration=200,
+            content_length=800,
+            user_id=self.admin_id,
+            author="Test Author",
+            original_url="https://example.com/private",
+            original_url_hash=Core.hash_url("https://example.com/private"),
+        )
+
+    def test_public_feed_page(self) -> None:
+        """Public feed page should show only public episodes."""
+        response = self.client.get("/public")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("Public Episode", response.text)
+        self.assertNotIn("Private Episode", response.text)
+
+    def test_home_page_shows_public_feed_when_logged_out(self) -> None:
+        """Home page should show public episodes when user is not logged in."""
+        response = self.client.get("/")
+
+        self.assertEqual(response.status_code, 200)
+        self.assertIn("Public Episode", response.text)
+        self.assertNotIn("Private Episode", response.text)
+
+    def test_admin_can_toggle_episode_public(self) -> None:
+        """Admin should be able to toggle episode public/private status."""
+        # Login as admin
+        self.client.post("/login", data={"email": "ben@bensima.com"})
+
+        # Toggle private episode to public
+        response = self.client.post(
+            f"/admin/episode/{self.private_episode_id}/toggle-public",
+        )
+
+        self.assertEqual(response.status_code, 200)
+
+        # Verify it's now public
+        episode = Core.Database.get_episode_by_id(self.private_episode_id)
+        self.assertEqual(episode["is_public"], 1)  # type: ignore[index]
+
+    def test_non_admin_cannot_toggle_public(self) -> None:
+        """Non-admin users should not be able to toggle public status."""
+        # Create and login as regular user
+        _user_id, _ = Core.Database.create_user("user@example.com")
+        self.client.post("/login", data={"email": "user@example.com"})
+
+        # Try to toggle
+        response = self.client.post(
+            f"/admin/episode/{self.private_episode_id}/toggle-public",
+        )
+
+        self.assertEqual(response.status_code, 403)
+
+
+class TestEpisodeDeduplication(BaseWebTest):
+    """Test episode deduplication functionality."""
+
+    def setUp(self) -> None:
+        """Set up test database, client, and create test user."""
+        super().setUp()
+
+        self.user_id, self.token = Core.Database.create_user(
+            "test@example.com",
+            status="active",
+        )
+
+        # Create an existing episode
+        self.existing_url = "https://example.com/article"
+        self.url_hash = Core.hash_url(self.existing_url)
+
+        self.episode_id = Core.Database.create_episode(
+            title="Existing Article",
+            audio_url="https://example.com/audio.mp3",
+            duration=300,
+            content_length=1000,
+            user_id=self.user_id,
+            author="Test Author",
+            original_url=self.existing_url,
+            original_url_hash=self.url_hash,
+        )
+
+    def test_url_normalization(self) -> None:
+        """URLs should be normalized for deduplication."""
+        # Different URL variations that should be normalized to same hash
+        urls = [
+            "http://example.com/article",
+            "https://example.com/article",
+            "https://www.example.com/article",
+            "https://EXAMPLE.COM/article",
+            "https://example.com/article/",
+        ]
+
+        hashes = [Core.hash_url(url) for url in urls]
+
+        # All should produce the same hash
+        self.assertEqual(len(set(hashes)), 1)
+
+    def test_find_existing_episode_by_hash(self) -> None:
+        """Should find existing episode by normalized URL hash."""
+        # Try different URL variations
+        similar_urls = [
+            "http://example.com/article",
+            "https://www.example.com/article",
+        ]
+
+        for url in similar_urls:
+            url_hash = Core.hash_url(url)
+            episode = Core.Database.find_episode_by_url_hash(url_hash)
+
+            self.assertIsNotNone(episode)
+            self.assertEqual(episode["id"], self.episode_id)  # type: ignore[index]
+
+    def test_add_existing_episode_to_user_feed(self) -> None:
+        """Should add existing episode to new user's feed."""
+        # Create second user
+        user2_id, _ = Core.Database.create_user("user2@example.com")
+
+        # Add existing episode to their feed
+        Core.Database.add_episode_to_user(user2_id, self.episode_id)
+
+        # Verify it appears in their feed
+        episodes = Core.Database.get_user_episodes(user2_id)
+        episode_ids = [e["id"] for e in episodes]
+
+        self.assertIn(self.episode_id, episode_ids)
+
+
+class TestMetricsTracking(BaseWebTest):
+    """Test episode metrics tracking."""
+
+    def setUp(self) -> None:
+        """Set up test database, client, and create test episode."""
+        super().setUp()
+
+        self.user_id, _ = Core.Database.create_user(
+            "test@example.com",
+            status="active",
+        )
+
+        self.episode_id = Core.Database.create_episode(
+            title="Test Episode",
+            audio_url="https://example.com/audio.mp3",
+            duration=300,
+            content_length=1000,
+            user_id=self.user_id,
+            author="Test Author",
+            original_url="https://example.com/article",
+            original_url_hash=Core.hash_url("https://example.com/article"),
+        )
+
+    def test_track_episode_added(self) -> None:
+        """Should track when episode is added to feed."""
+        Core.Database.track_episode_event(
+            self.episode_id,
+            "added",
+            self.user_id,
+        )
+
+        # Verify metric was recorded
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        self.assertEqual(len(metrics), 1)
+        self.assertEqual(metrics[0]["event_type"], "added")
+        self.assertEqual(metrics[0]["user_id"], self.user_id)
+
+    def test_track_episode_played(self) -> None:
+        """Should track when episode is played."""
+        Core.Database.track_episode_event(
+            self.episode_id,
+            "played",
+            self.user_id,
+        )
+
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        self.assertEqual(len(metrics), 1)
+        self.assertEqual(metrics[0]["event_type"], "played")
+
+    def test_track_anonymous_play(self) -> None:
+        """Should track plays from anonymous users."""
+        Core.Database.track_episode_event(
+            self.episode_id,
+            "played",
+            user_id=None,
+        )
+
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        self.assertEqual(len(metrics), 1)
+        self.assertEqual(metrics[0]["event_type"], "played")
+        self.assertIsNone(metrics[0]["user_id"])
+
+    def test_track_endpoint(self) -> None:
+        """POST /episode/{id}/track should record metrics."""
+        # Login as user
+        self.client.post("/login", data={"email": "test@example.com"})
+
+        response = self.client.post(
+            f"/episode/{self.episode_id}/track",
+            data={"event_type": "played"},
+        )
+
+        self.assertEqual(response.status_code, 200)
+
+        # Verify metric was recorded
+        metrics = Core.Database.get_episode_metrics(self.episode_id)
+        played_metrics = [m for m in metrics if m["event_type"] == "played"]
+        self.assertGreater(len(played_metrics), 0)
+
+
 def test() -> None:
     """Run all tests for the web module."""
     Test.run(
@@ -2375,6 +2611,9 @@ def test() -> None:
             TestAdminInterface,
             TestJobCancellation,
             TestEpisodeDetailPage,
+            TestPublicFeed,
+            TestEpisodeDeduplication,
+            TestMetricsTracking,
         ],
     )
 
diff --git a/Biz/PodcastItLater/Worker.py b/Biz/PodcastItLater/Worker.py
index 75a111c..5203490 100644
--- a/Biz/PodcastItLater/Worker.py
+++ b/Biz/PodcastItLater/Worker.py
@@ -129,8 +129,13 @@ class ArticleProcessor:
             self.s3_client = None
 
     @staticmethod
-    def extract_article_content(url: str) -> tuple[str, str]:
-        """Extract title and content from article URL using trafilatura.
+    def extract_article_content(
+        url: str,
+    ) -> tuple[str, str, str | None, str | None]:
+        """Extract title, content, author, and date from article URL.
+
+        Returns:
+            tuple: (title, content, author, publication_date)
 
         Raises:
             ValueError: If content cannot be downloaded, extracted, or large.
@@ -165,6 +170,8 @@ class ArticleProcessor:
 
             title = data.get("title", "Untitled Article")
             content = data.get("text", "")
+            author = data.get("author")
+            pub_date = data.get("date")
 
             if not content:
                 msg = f"No content extracted from {url}"
@@ -179,23 +186,50 @@ class ArticleProcessor:
                 )
                 content = content[:MAX_ARTICLE_SIZE]
 
-            logger.info("Extracted article: %s (%d chars)", title, len(content))
+            logger.info(
+                "Extracted article: %s (%d chars, author: %s, date: %s)",
+                title,
+                len(content),
+                author or "unknown",
+                pub_date or "unknown",
+            )
         except Exception:
             logger.exception("Failed to extract content from %s", url)
             raise
         else:
-            return title, content
+            return title, content, author, pub_date
 
-    def text_to_speech(self, text: str, title: str) -> bytes:
-        """Convert text to speech using OpenAI TTS API.
+    def text_to_speech(
+        self,
+        text: str,
+        title: str,
+        author: str | None = None,
+        pub_date: str | None = None,
+    ) -> bytes:
+        """Convert text to speech with intro/outro using OpenAI TTS API.
 
         Uses parallel processing for chunks while maintaining order.
+        Adds intro with metadata and outro with attribution.
+
+        Args:
+            text: Article content to convert
+            title: Article title
+            author: Article author (optional)
+            pub_date: Publication date (optional)
 
         Raises:
             ValueError: If no chunks are generated from text.
         """
         try:
-            # Use LLM to prepare and chunk the text
+            # Generate intro audio
+            intro_text = self._create_intro_text(title, author, pub_date)
+            intro_audio = self._generate_tts_segment(intro_text)
+
+            # Generate outro audio
+            outro_text = self._create_outro_text(title, author)
+            outro_audio = self._generate_tts_segment(outro_text)
+
+            # Use LLM to prepare and chunk the main content
             chunks = prepare_text_for_tts(text, title)
 
             if not chunks:
@@ -212,54 +246,144 @@ class ArticleProcessor:
                     "processing",
                     mem_usage,
                 )
-                return self._text_to_speech_serial(chunks)
+                content_audio_bytes = self._text_to_speech_serial(chunks)
+            else:
+                # Determine max workers
+                max_workers = min(
+                    4,  # Reasonable limit to avoid rate limiting
+                    len(chunks),  # No more workers than chunks
+                    max(1, psutil.cpu_count() // 2),  # Use half of CPU cores
+                )
 
-            # Determine max workers based on chunk count and system resources
-            max_workers = min(
-                4,  # Reasonable limit to avoid rate limiting
-                len(chunks),  # No more workers than chunks
-                max(1, psutil.cpu_count() // 2),  # Use half of CPU cores
-            )
+                logger.info(
+                    "Using %d workers for parallel TTS processing",
+                    max_workers,
+                )
 
-            logger.info(
-                "Using %d workers for parallel TTS processing",
-                max_workers,
+                # Process chunks in parallel
+                chunk_results: list[tuple[int, bytes]] = []
+
+                with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=max_workers,
+                ) as executor:
+                    # Submit all chunks for processing
+                    future_to_index = {
+                        executor.submit(self._process_tts_chunk, chunk, i): i
+                        for i, chunk in enumerate(chunks)
+                    }
+
+                    # Collect results as they complete
+                    for future in concurrent.futures.as_completed(
+                        future_to_index,
+                    ):
+                        index = future_to_index[future]
+                        try:
+                            audio_data = future.result()
+                            chunk_results.append((index, audio_data))
+                        except Exception:
+                            logger.exception(
+                                "Failed to process chunk %d",
+                                index,
+                            )
+                            raise
+
+                # Sort results by index to maintain order
+                chunk_results.sort(key=operator.itemgetter(0))
+
+                # Combine audio chunks
+                content_audio_bytes = self._combine_audio_chunks([
+                    data for _, data in chunk_results
+                ])
+
+            # Combine intro, content, and outro with pauses
+            return ArticleProcessor._combine_intro_content_outro(
+                intro_audio,
+                content_audio_bytes,
+                outro_audio,
             )
 
-            # Process chunks in parallel
-            chunk_results: list[tuple[int, bytes]] = []
-
-            with concurrent.futures.ThreadPoolExecutor(
-                max_workers=max_workers,
-            ) as executor:
-                # Submit all chunks for processing
-                future_to_index = {
-                    executor.submit(self._process_tts_chunk, chunk, i): i
-                    for i, chunk in enumerate(chunks)
-                }
-
-                # Collect results as they complete
-                for future in concurrent.futures.as_completed(future_to_index):
-                    index = future_to_index[future]
-                    try:
-                        audio_data = future.result()
-                        chunk_results.append((index, audio_data))
-                    except Exception:
-                        logger.exception("Failed to process chunk %d", index)
-                        raise
-
-            # Sort results by index to maintain order
-            chunk_results.sort(key=operator.itemgetter(0))
-
-            # Combine audio chunks
-            return self._combine_audio_chunks([
-                data for _, data in chunk_results
-            ])
-
         except Exception:
             logger.exception("TTS generation failed")
             raise
 
+    @staticmethod
+    def _create_intro_text(
+        title: str,
+        author: str | None,
+        pub_date: str | None,
+    ) -> str:
+        """Create intro text with available metadata."""
+        parts = [f"Title: {title}"]
+
+        if author:
+            parts.append(f"Author: {author}")
+
+        if pub_date:
+            parts.append(f"Published: {pub_date}")
+
+        return ". ".join(parts) + "."
+
+    @staticmethod
+    def _create_outro_text(title: str, author: str | None) -> str:
+        """Create outro text with attribution."""
+        if author:
+            return (
+                f"This has been an audio version of {title} "
+                f"by {author}, created using Podcast It Later."
+            )
+        return (
+            f"This has been an audio version of {title}, "
+            "created using Podcast It Later."
+        )
+
+    def _generate_tts_segment(self, text: str) -> bytes:
+        """Generate TTS audio for a single segment (intro/outro).
+
+        Args:
+            text: Text to convert to speech
+
+        Returns:
+            MP3 audio bytes
+        """
+        response = self.openai_client.audio.speech.create(
+            model=TTS_MODEL,
+            voice=TTS_VOICE,
+            input=text,
+        )
+        return response.content
+
+    @staticmethod
+    def _combine_intro_content_outro(
+        intro_audio: bytes,
+        content_audio: bytes,
+        outro_audio: bytes,
+    ) -> bytes:
+        """Combine intro, content, and outro with 1-second pauses.
+
+        Args:
+            intro_audio: MP3 bytes for intro
+            content_audio: MP3 bytes for main content
+            outro_audio: MP3 bytes for outro
+
+        Returns:
+            Combined MP3 audio bytes
+        """
+        # Load audio segments
+        intro = AudioSegment.from_mp3(io.BytesIO(intro_audio))
+        content = AudioSegment.from_mp3(io.BytesIO(content_audio))
+        outro = AudioSegment.from_mp3(io.BytesIO(outro_audio))
+
+        # Create 1-second silence
+        pause = AudioSegment.silent(duration=1000)  # milliseconds
+
+        # Combine segments with pauses
+        combined = intro + pause + content + pause + outro
+
+        # Export to bytes
+        output = io.BytesIO()
+        combined.export(output, format="mp3")
+        return output.getvalue()
+
     def _process_tts_chunk(self, chunk: str, index: int) -> bytes:
         """Process a single TTS chunk.
 
@@ -496,15 +620,17 @@ class ArticleProcessor:
                 return
 
             # Step 1: Extract article content
-            title, content = ArticleProcessor.extract_article_content(url)
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(url)
+            )
 
             if self.shutdown_handler.is_shutdown_requested():
                 logger.info("Shutdown requested, aborting job %d", job_id)
                 Core.Database.update_job_status(job_id, "pending")
                 return
 
-            # Step 2: Generate audio
-            audio_data = self.text_to_speech(content, title)
+            # Step 2: Generate audio with metadata
+            audio_data = self.text_to_speech(content, title, author, pub_date)
 
             if self.shutdown_handler.is_shutdown_requested():
                 logger.info("Shutdown requested, aborting job %d", job_id)
@@ -922,12 +1048,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Test Article")
         self.assertEqual(content, "Content here")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)
 
     def test_extract_missing_title(self) -> None:
         """Handle articles without titles."""
@@ -944,12 +1074,16 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Untitled Article")
         self.assertEqual(content, "Content without title")
+        self.assertIsNone(author)
+        self.assertIsNone(pub_date)
 
     def test_extract_empty_content(self) -> None:
         """Handle empty articles."""
@@ -1020,8 +1154,10 @@ class TestArticleExtraction(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            _title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            _title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(content, "Good content")
@@ -1458,6 +1594,92 @@ class TestTextToSpeech(Test.TestCase):
         self.assertEqual(audio_data, b"ordered-audio")
 
 
+class TestIntroOutro(Test.TestCase):
+    """Test intro and outro generation with metadata."""
+
+    def test_create_intro_text_full_metadata(self) -> None:
+        """Test intro text creation with all metadata."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author="John Doe",
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertIn("Author: John Doe", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_no_author(self) -> None:
+        """Test intro text without author."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date="2024-01-15",
+        )
+        self.assertIn("Title: Test Article", intro)
+        self.assertNotIn("Author:", intro)
+        self.assertIn("Published: 2024-01-15", intro)
+
+    def test_create_intro_text_minimal(self) -> None:
+        """Test intro text with only title."""
+        intro = ArticleProcessor._create_intro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+            pub_date=None,
+        )
+        self.assertEqual(intro, "Title: Test Article.")
+
+    def test_create_outro_text_with_author(self) -> None:
+        """Test outro text with author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author="Jane Smith",
+        )
+        self.assertIn("Test Article", outro)
+        self.assertIn("Jane Smith", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_create_outro_text_no_author(self) -> None:
+        """Test outro text without author."""
+        outro = ArticleProcessor._create_outro_text(  # noqa: SLF001
+            title="Test Article",
+            author=None,
+        )
+        self.assertIn("Test Article", outro)
+        self.assertNotIn("by", outro)
+        self.assertIn("Podcast It Later", outro)
+
+    def test_extract_with_metadata(self) -> None:
+        """Test that extraction returns metadata."""
+        mock_html = "<html><body><p>Content</p></body></html>"
+        mock_result = json.dumps({
+            "title": "Test Article",
+            "text": "Article content",
+            "author": "Test Author",
+            "date": "2024-01-15",
+        })
+
+        with (
+            unittest.mock.patch(
+                "trafilatura.fetch_url",
+                return_value=mock_html,
+            ),
+            unittest.mock.patch(
+                "trafilatura.extract",
+                return_value=mock_result,
+            ),
+        ):
+            title, content, author, pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
+            )
+
+        self.assertEqual(title, "Test Article")
+        self.assertEqual(content, "Article content")
+        self.assertEqual(author, "Test Author")
+        self.assertEqual(pub_date, "2024-01-15")
+
+
 class TestMemoryEfficiency(Test.TestCase):
     """Test memory-efficient processing."""
 
@@ -1494,8 +1716,10 @@ class TestMemoryEfficiency(Test.TestCase):
                 return_value=mock_result,
             ),
         ):
-            title, content = ArticleProcessor.extract_article_content(
-                "https://example.com",
+            title, content, _author, _pub_date = (
+                ArticleProcessor.extract_article_content(
+                    "https://example.com",
+                )
             )
 
         self.assertEqual(title, "Large Article")
@@ -1558,7 +1782,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
@@ -1727,7 +1956,12 @@ class TestJobProcessing(Test.TestCase):
             unittest.mock.patch.object(
                 ArticleProcessor,
                 "extract_article_content",
-                return_value=("Test Title", "Test content"),
+                return_value=(
+                    "Test Title",
+                    "Test content",
+                    "Test Author",
+                    "2024-01-15",
+                ),
             ),
             unittest.mock.patch.object(
                 ArticleProcessor,
author	Ben Sima <ben@bsima.me>	2025-11-16 03:47:16 -0500
committer	Ben Sima <ben@bsima.me>	2025-11-16 03:47:16 -0500
commit	f74ee8bc380f07e597b638a719e7bbfe9461a031 (patch)
tree	7562c8f38d87c7743d74b84012ba8bffc843b0e2
parent	081f0759b37452bb1319c4f5f88a1d451a5177a9 (diff)