telegram: fix audio transcription model and prompt order

- Switch from gemini-2.0-flash-001 to gemini-2.5-flash
- Put audio content before text prompt (model was ignoring audio)
- Strengthen prompt to return only transcription
author: Ben Sima <ben@bensima.com> 2025-12-13 13:28:59 -0500
committer: Ben Sima <ben@bensima.com> 2025-12-13 13:28:59 -0500
commit: a14881ddcdd6ce83250c978d9df825c29e8d93c6 (patch)
tree: 68b355d078c05e37e0ac3267f41fdf8656f22b43 /Omni/Agent/Telegram
parent: 4d21f170cd1d1df239d7ad00fbf79427769a140f (diff)
1 files changed, 5 insertions, 5 deletions
diff --git a/Omni/Agent/Telegram/Media.hs b/Omni/Agent/Telegram/Media.hs
index 137d7d3..6539b79 100644
--- a/Omni/Agent/Telegram/Media.hs
+++ b/Omni/Agent/Telegram/Media.hs
@@ -274,22 +274,22 @@ transcribeVoice apiKey audioBytes = do
   let base64Data = TL.toStrict (TLE.decodeUtf8 (B64.encode audioBytes))
       body =
         Aeson.object
-          [ "model" .= ("google/gemini-2.0-flash-001" :: Text),
+          [ "model" .= ("google/gemini-2.5-flash" :: Text),
             "messages"
               .= [ Aeson.object
                      [ "role" .= ("user" :: Text),
                        "content"
                          .= [ Aeson.object
-                                [ "type" .= ("text" :: Text),
-                                  "text" .= ("transcribe this audio exactly, return only the transcription with no commentary" :: Text)
-                                ],
-                              Aeson.object
                                 [ "type" .= ("input_audio" :: Text),
                                   "input_audio"
                                     .= Aeson.object
                                       [ "data" .= base64Data,
                                         "format" .= ("ogg" :: Text)
                                       ]
+                                ],
+                              Aeson.object
+                                [ "type" .= ("text" :: Text),
+                                  "text" .= ("transcribe this audio exactly. return ONLY the transcription, no commentary or preamble." :: Text)
                                 ]
                             ]
                      ]
author	Ben Sima <ben@bensima.com>	2025-12-13 13:28:59 -0500
committer	Ben Sima <ben@bensima.com>	2025-12-13 13:28:59 -0500
commit	a14881ddcdd6ce83250c978d9df825c29e8d93c6 (patch)
tree	68b355d078c05e37e0ac3267f41fdf8656f22b43 /Omni/Agent/Telegram
parent	4d21f170cd1d1df239d7ad00fbf79427769a140f (diff)