How can I efficiently convert gTTS audio into pydub AudioSegments?
I want to manipulate gTTS audio in pydub, but I am not sure how to convert the gTTS file-like object into a pydub AudioSegment.
I know that I can save Google text-to-speech audio as an mp3 file, and I know that I can import an mp3 file with pydub, but this process is inefficient. Is there a way to skip creating the mp3 file and reading it back in?
I tried passing the gTTS file-like object as the argument to AudioSegment.from_mp3(),
but I am pretty sure it is looking for a string.
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
# In-memory buffer that receives the MP3 bytes produced by gTTS.
mp3_fp = BytesIO()
tts = gTTS('hello', 'en')
tts.write_to_fp(mp3_fp)
# NOTE: the buffer is handed to pydub without being rewound first (there is no
# mp3_fp.seek(0) here); this is the call that raises CouldntDecodeError in the
# traceback that follows.
song = AudioSegment.from_mp3(mp3_fp)
I got a "CouldntDecodeError":
File "C:\ProgramData\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/py/ex/gtts_test.py", line 18, in <module>
song = AudioSegment.from_mp3(mp3_fp)
File "C:\ProgramData\Anaconda3\lib\site-packages\pydub\audio_segment.py", line 716, in from_mp3
return cls.from_file(file, 'mp3', parameters=parameters)
File "C:\ProgramData\Anaconda3\lib\site-packages\pydub\audio_segment.py", line 704, in from_file
p.returncode, p_err))
CouldntDecodeError: Decoding failed. ffmpeg returned error code: 1
Output from ffmpeg/avlib:
b'ffmpeg version 4.1.1 Copyright (c) 2000-2019 the FFmpeg developers\r\n built with gcc 8.2.1 (GCC) 20190212\r\n configuration: --enable-gpl --enable-version3 --enable-sdl2 --enable-fontconfig --enable-gnutls --enable-iconv --enable-libass --enable-libbluray --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libopus --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libtheora --enable-libtwolame --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libzimg --enable-lzma --enable-zlib --enable-gmp --enable-libvidstab --enable-libvorbis --enable-libvo-amrwbenc --enable-libmysofa --enable-libspeex --enable-libxvid --enable-libaom --enable-libmfx --enable-amf --enable-ffnvcodec --enable-cuvid --enable-d3d11va --enable-nvenc --enable-nvdec --enable-dxva2 --enable-avisynth\r\n libavutil 56. 22.100 / 56. 22.100\r\n libavcodec 58. 35.100 / 58. 35.100\r\n libavformat 58. 20.100 / 58. 20.100\r\n libavdevice 58. 5.100 / 58. 5.100\r\n libavfilter 7. 40.101 / 7. 40.101\r\n libswscale 5. 3.100 / 5. 3.100\r\n libswresample 3. 3.100 / 3. 3.100\r\n libpostproc 55. 3.100 / 55. 3.100\r\n[mp3 @ 000001da0c1292c0] Failed to read frame size: Could not seek to 1026.\r\npipe:: Invalid argument\r\n'
Solution 1:[1]
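You must seek the BytesIO object back to position 0 before passing it to pydub! After write_to_fp() the stream position sits at the end of the data that was just written, so rewind it first: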
from gtts import gTTS
from io import BytesIO
from pydub import AudioSegment
# Synthesize straight into an in-memory buffer instead of a temporary .mp3 file.
mp3_fp = BytesIO()
# Pass the language by keyword: in current gTTS releases the second positional
# parameter is `tld`, so a positional 'en' would not be read as the language.
tts = gTTS('hello', lang='en')
tts.write_to_fp(mp3_fp)
# Writing leaves the stream position at the end of the buffer; rewind it so
# pydub reads the MP3 data from the start.
mp3_fp.seek(0)
song = AudioSegment.from_mp3(mp3_fp)
Solution 2:[2]
If you have a very long text, such as a Wikipedia summary, converting it to speech in one go can take too long. The script below takes a long text, splits it into sentences, converts each sentence to speech, and saves each one as an mp3 file. As a fun extra, I also added a part that changes the playback speed to make the voice sound like a child's. Have fun...
import os
import threading
from io import BytesIO
from queue import Queue  # Python 3 import

import pygame
from gtts import gTTS
from nltk.tokenize import sent_tokenize
from pydub import AudioSegment
# Sentences waiting to be synthesized: filled by split_sentence(), drained by
# recognize_worker(). A None item marks the end of the text.
sentence_queue = Queue()
# Paths of exported MP3 files waiting to be played: filled by
# recognize_worker(), drained by say(). A None item marks the end of the audio.
audio_queue = Queue()
def split_sentence(text):
    """Split *text* into sentences and enqueue them on ``sentence_queue``.

    Exclamation and question marks are first rewritten as ``'. '`` so that
    every sentence ends in a full stop before tokenizing. After the last
    sentence a ``None`` sentinel is enqueued to signal the end of the input.
    """
    for mark in ('!', '?'):
        text = text.replace(mark, '. ')
    for piece in sent_tokenize(text):
        sentence_queue.put(piece)
    # Sentinel: tells the consumer there is nothing more to synthesize.
    sentence_queue.put(None)
def speed_swifter(sound, speed=1.0):
    """Return a copy of *sound* whose frame rate is scaled by *speed*.

    The raw sample data is reused unchanged; only the ``frame_rate``
    override passed to ``sound._spawn`` differs, and it is truncated to an
    integer.
    """
    new_rate = int(sound.frame_rate * speed)
    return sound._spawn(sound.raw_data, overrides={"frame_rate": new_rate})
def recognize_worker():
    """Background worker: turn queued sentences into sped-up MP3 files.

    Runs in a background thread. Sentences are taken from ``sentence_queue``
    until the ``None`` sentinel arrives. Each sentence is synthesized with
    gTTS (``lang="tr"``) into an in-memory buffer, decoded with pydub, sped
    up by a factor of 1.3 and exported to ``assets/sound_out/<n>.mp3``; the
    path of the exported file is then put on ``audio_queue``.

    A sentence whose synthesis fails is reported and skipped instead of being
    decoded from an empty buffer. ``None`` is always put on ``audio_queue``
    when the worker exits, including when it exits because of an error, so
    the consumer of ``audio_queue`` cannot block forever.

    The output directory is not created here; it must already exist.
    """
    i = 1  # running number used in the exported file name
    try:
        while True:
            # retrieve the next sentence job from the main thread
            sentence = sentence_queue.get()
            if sentence is None:
                # Balance the sentinel's put() so sentence_queue.join() can return.
                sentence_queue.task_done()
                break

            mp3_fp = BytesIO()
            # received sentence data, now we'll convert it using Google
            try:
                speech = gTTS(text=sentence, lang="tr", slow=False)
                speech.write_to_fp(mp3_fp)  # save to IO bytes
            except Exception as exc:
                # NOTE(review): the message text looks mis-encoded in the
                # source; it is kept unchanged and the cause is appended.
                print("gtts çal??mad?", exc)
                # Nothing usable was written; skip this sentence.
                continue
            finally:
                # mark the audio processing job as completed in the queue
                sentence_queue.task_done()

            mp3_fp.seek(0)  # rewind so pydub reads the MP3 from the start
            song = AudioSegment.from_mp3(mp3_fp)  # convert to audio
            faster_sound = speed_swifter(song, 1.3)
            out_path = 'assets/sound_out/' + str(i) + '.mp3'
            faster_sound.export(out_path, format="mp3")
            audio_queue.put(out_path)
            i += 1
    finally:
        # Always signal the end of the audio stream, even after a failure.
        audio_queue.put(None)
def player(filename):
    """Play the audio file *filename* to completion, then delete it.

    The file is loaded into pygame's music mixer and played at full volume.
    The call blocks until playback has finished, after which the file is
    unloaded from the mixer and removed from disk.
    """
    pygame.mixer.init()
    pygame.mixer.music.load(filename)
    pygame.mixer.music.set_volume(1.0)
    pygame.mixer.music.play()
    # Sleep between polls instead of spinning, so waiting for the end of
    # playback does not keep a CPU core busy.
    while pygame.mixer.music.get_busy():
        pygame.time.wait(50)
    # Release the file before deleting it.
    pygame.mixer.music.unload()
    os.remove(filename)
def say(context):
    """Speak *context* aloud, one sentence at a time.

    The text is split into sentences, a background thread running
    ``recognize_worker`` is started, and every file path that arrives on
    ``audio_queue`` is played in order until the ``None`` sentinel is
    received.
    """
    split_sentence(context)
    worker = threading.Thread(target=recognize_worker)
    worker.start()
    # iter(callable, sentinel) keeps calling audio_queue.get() until it
    # returns None.
    for clip_path in iter(audio_queue.get, None):
        player(clip_path)
        audio_queue.task_done()
if __name__ == "__main__":
    # Demo run: sample text that is spoken when the file is executed as a script.
    context = 'Murat Boz. Türk ?ark?c?. söz yazar?. ve oyuncudur.ürün bekledi?imizden. çok çok iyi. gerek görüntü. kalitesi gerek de ses kalitesi gayet güzel. dü?ünmeden alabilirsiniz '
    say(context)
``
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
Solution | Source |
---|---|
Solution 1 | Chang Hwan Lee |
Solution 2 |