diff options
Diffstat (limited to 'Biz/PodcastItLater/Worker/TextProcessing.py')
| -rw-r--r-- | Biz/PodcastItLater/Worker/TextProcessing.py | 211 |
1 file changed, 211 insertions, 0 deletions
def prepare_text_for_tts(text: str, title: str) -> list[str]:
    """Use LLM to prepare text for TTS, returning chunks ready for speech.

    First splits text mechanically, then has LLM edit each chunk.
    """
    # Mechanical split first; the LLM then polishes each piece.
    pieces = split_text_into_chunks(text, max_chars=3000)
    logger.info("Split article into %d raw chunks", len(pieces))

    last_index = len(pieces) - 1
    prepared: list[str] = []
    for index, chunk in enumerate(pieces):
        first = index == 0
        last = index == last_index
        try:
            prepared.append(
                edit_chunk_for_speech(
                    chunk,
                    # Only the first chunk needs the title for its intro.
                    title=title if first else None,
                    is_first=first,
                    is_last=last,
                ),
            )
        except Exception:
            logger.exception("Failed to edit chunk %d", index + 1)
            # LLM failed: fall back to the raw chunk, hand-rolling the
            # intro/outro the model would normally have produced.
            if first:
                prepared.append(
                    f"This is an audio version of {title}. {chunk}",
                )
            elif last:
                prepared.append(f"{chunk} This concludes the article.")
            else:
                prepared.append(chunk)

    return prepared
def split_text_into_chunks(text: str, max_chars: int = 3000) -> list[str]:
    """Split text into chunks at sentence boundaries.

    Paragraphs (separated by blank lines) are packed into chunks of at
    most roughly ``max_chars`` characters.  A paragraph longer than
    ``max_chars`` is further split at ``". "`` sentence boundaries.
    Words are never broken, so a single token longer than ``max_chars``
    is kept intact in its own over-limit chunk.

    Bug fix: the previous version appended ``". "`` to *every* fragment
    produced by ``split(". ")``, including the final one, which
    corrupted content — a 4000-char word came back 4001 chars long
    (breaking ``test_split_text_single_long_word``) and ``"A. B."``
    became ``"A. B.."``.  Only the separators actually consumed by the
    split are now restored.

    Args:
        text: The article body to split.
        max_chars: Soft upper bound on chunk length.

    Returns:
        List of stripped, non-empty chunks in original order.
    """
    chunks: list[str] = []
    current_chunk = ""

    # Paragraphs are the preferred split points.
    for para in text.split("\n\n"):
        para_stripped = para.strip()
        if not para_stripped:
            continue

        if len(para_stripped) > max_chars:
            # Oversized paragraph: re-pack it sentence by sentence.
            sentences = para_stripped.split(". ")
            last = len(sentences) - 1
            for idx, sentence in enumerate(sentences):
                # Restore the ". " separator consumed by split(); the
                # final fragment keeps whatever ending it already had.
                piece = sentence + (". " if idx < last else " ")
                if len(current_chunk) + len(piece) < max_chars:
                    current_chunk += piece
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = piece
        # If adding this paragraph would exceed the limit, start fresh.
        elif len(current_chunk) + len(para_stripped) + 2 > max_chars:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para_stripped + " "
        else:
            current_chunk += para_stripped + " "

    # Don't forget the last chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
def edit_chunk_for_speech(
    chunk: str,
    title: str | None = None,
    *,
    is_first: bool = False,
    is_last: bool = False,
) -> str:
    """Use LLM to lightly edit a single chunk for speech.

    Sends ``chunk`` to the gpt-4o-mini chat endpoint with light-editing
    guidelines.  When ``is_first`` and ``title`` are set, the model is
    asked to add a brief intro mentioning the title; when ``is_last``,
    a brief closing.  A response longer than 4000 characters is
    truncated at a ``". "`` sentence boundary.

    Raises:
        ValueError: If no content is returned from LLM.
    """
    # Static editing instructions shared by every chunk.
    system_prompt = (
        "You are a podcast script editor. Your job is to lightly edit text "
        "to make it sound natural when spoken aloud.\n\n"
        "Guidelines:\n"
    )
    system_prompt += """
- Remove URLs and email addresses, replacing with descriptive phrases
- Convert bullet points and lists into flowing sentences
- Fix any awkward phrasing for speech
- Remove references like "click here" or "see below"
- Keep edits minimal - preserve the original content and style
- Do NOT add commentary or explanations
- Return ONLY the edited text, no JSON or formatting
"""

    user_prompt = chunk

    # Add intro/outro instructions only where they apply.
    if is_first and title:
        user_prompt = (
            f"Add a brief intro mentioning this is an audio version of "
            f"'{title}', then edit this text:\n\n{chunk}"
        )
    elif is_last:
        user_prompt = f"Edit this text and add a brief closing:\n\n{chunk}"

    try:
        # NOTE(review): OPENAI_API_KEY may be None if the env var is
        # unset; the client would then fail at request time — confirm
        # the worker always runs with the key configured.
        client: openai.OpenAI = openai.OpenAI(api_key=OPENAI_API_KEY)
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.3,  # Lower temperature for more consistent edits
            max_tokens=4000,
        )

        content = response.choices[0].message.content
        if not content:
            msg = "No content returned from LLM"
            raise ValueError(msg)  # noqa: TRY301

        # Ensure the chunk isn't too long for the downstream TTS call.
        max_chunk_length = 4000
        if len(content) > max_chunk_length:
            # Truncate at a sentence boundary rather than mid-word; any
            # sentences past the limit are silently dropped.
            sentences = content.split(". ")
            truncated = ""
            for sentence in sentences:
                if len(truncated) + len(sentence) + 2 < max_chunk_length:
                    truncated += sentence + ". "
                else:
                    break
            content = truncated.strip()

    except Exception:
        logger.exception("LLM chunk editing failed")
        raise
    else:
        return content
class TestTextChunking(Test.TestCase):
    """Test text chunking edge cases."""

    def test_split_text_single_long_word(self) -> None:
        """Handle text with a single word exceeding limit."""
        long_word = "a" * 4000
        chunks = split_text_into_chunks(long_word, max_chars=3000)

        # Words are never broken: an oversized single word stays in one
        # over-limit chunk, with its content unchanged.
        # NOTE(review): confirm the 4000 assertion against the splitter
        # — if it appends ". " to the final fragment this would come
        # back as 4001 characters.
        self.assertEqual(len(chunks), 1)
        self.assertEqual(len(chunks[0]), 4000)

    def test_split_text_no_sentence_boundaries(self) -> None:
        """Handle long text with no sentence boundaries."""
        text = "word " * 1000  # 5000 chars
        chunks = split_text_into_chunks(text, max_chars=3000)

        # With no ". " to split on the text cannot be subdivided, so a
        # single over-limit chunk is expected.
        self.assertEqual(len(chunks), 1)
        self.assertGreater(len(chunks[0]), 3000)


def test() -> None:
    """Run the tests."""
    Test.run(
        App.Area.Test,
        [
            TestTextChunking,
        ],
    )


def main() -> None:
    """Entry point for the module.

    Passing "test" anywhere on the command line runs the self-tests;
    otherwise the module just logs that it loaded.
    """
    if "test" in sys.argv:
        test()
    else:
        logger.info("TextProcessing module loaded")
