Creating a video avatar generator completely locally without cloud services or API keys is challenging but possible. Here’s a step-by-step guide to build a system that generates talking video avatars using locally-run models.

Prerequisites

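Step 1: Set Up Ollama for Local LLM

Install Ollama and download the model the pipeline script in Step 5 uses (it calls "ollama run llama3"), for example with: ollama pull llama3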

Step 2: Install Local Text-to-Speech Engine

# Create the voice directory the pipeline script reads from, then download
# the en_US "lessac" medium voice model together with its JSON config.
mkdir -p ~/.local/share/piper-tts/voices
cd ~/.local/share/piper-tts/voices
wget https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx
wget https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/medium/en_US-lessac-medium.onnx.json

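Step 3: Install Wav2Lip for Avatar Animation

The pipeline script in Step 5 expects the Wav2Lip repository to be located at ~/Wav2Lip, with the pretrained weights saved as ~/Wav2Lip/checkpoints/wav2lip.pth.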

Step 4: Install FFmpeg for Video Processing

# Install FFmpeg with apt-get; -y answers the confirmation prompt automatically.
sudo apt-get install -y ffmpeg

Step 5: Create the Pipeline Script

#!/usr/bin/env python3
import os
import subprocess
import argparse
import tempfile
import json

def generate_script(prompt):
    """Generate script text with a locally running Ollama model.

    Args:
        prompt: Free-form instruction passed verbatim to ``ollama run llama3``.

    Returns:
        The model's standard output with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the ``ollama`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If ``ollama`` exits with a non-zero
            status.
    """
    print("Generating script with Ollama...")
    # Pass the prompt as its own argv entry rather than interpolating it into
    # a shell string: quotes, ``$`` or ``;`` inside the prompt would otherwise
    # corrupt the command line or be executed by the shell.
    result = subprocess.run(
        ["ollama", "run", "llama3", prompt],
        capture_output=True,
        text=True,
        check=True,  # fail loudly instead of returning an empty script
    )
    return result.stdout.strip()

def text_to_speech(text, output_wav):
    """Convert text to speech using Piper.

    Args:
        text: The text to synthesize; it is sent to Piper on standard input.
        output_wav: Path of the WAV file Piper should write.

    Raises:
        FileNotFoundError: If the ``piper`` executable is not on ``PATH``.
        subprocess.CalledProcessError: If ``piper`` exits with a non-zero
            status.
    """
    print("Converting text to speech...")
    voice_model = os.path.expanduser(
        "~/.local/share/piper-tts/voices/en_US-lessac-medium.onnx"
    )
    # Feed the text directly on stdin and pass paths as separate argv entries:
    # no intermediate temp file or ``cat`` pipeline is needed, and paths that
    # contain spaces or shell metacharacters are handled correctly.
    subprocess.run(
        ["piper", "--model", voice_model, "--output_file", output_wav],
        input=text,
        text=True,
        check=True,
    )

def animate_avatar(face_path, audio_path, output_video):
    """Animate the avatar using Wav2Lip.

    Runs ``inference.py`` from the Wav2Lip checkout at ``~/Wav2Lip`` with the
    ``checkpoints/wav2lip.pth`` weights.

    Args:
        face_path: Path to the face image or video.
        audio_path: Path to the speech audio file.
        output_video: Path where the resulting video should be written.

    Raises:
        FileNotFoundError: If ``python`` is not on ``PATH`` or ``~/Wav2Lip``
            does not exist.
        subprocess.CalledProcessError: If the inference script exits with a
            non-zero status.
    """
    print("Animating avatar...")
    wav2lip_path = os.path.expanduser("~/Wav2Lip")

    # Resolve caller-supplied paths against the *current* directory before the
    # child process switches to the Wav2Lip checkout; otherwise a relative
    # face or output path would silently be looked up inside ~/Wav2Lip.
    face_abs = os.path.abspath(face_path)
    audio_abs = os.path.abspath(audio_path)
    output_abs = os.path.abspath(output_video)

    # ``cwd=`` changes the directory only for the child, so this process's
    # working directory is left untouched (unlike ``os.chdir``).
    subprocess.run(
        [
            "python", "inference.py",
            "--checkpoint_path", "checkpoints/wav2lip.pth",
            "--face", face_abs,
            "--audio", audio_abs,
            "--outfile", output_abs,
            "--nosmooth",
        ],
        cwd=wav2lip_path,
        check=True,
    )

def main():
    """Command-line entry point: prompt -> script -> speech -> talking video.

    Parses ``--prompt``, ``--face`` and ``--output``, generates a script with
    the LLM, synthesizes it to a temporary WAV file and lip-syncs the face to
    that audio.

    Raises:
        SystemExit: If the arguments are invalid, the face file is missing,
            the generated script is empty, or no output video was produced.
    """
    parser = argparse.ArgumentParser(description='Generate a talking avatar video locally')
    parser.add_argument('--prompt', required=True, help='Prompt for script generation')
    parser.add_argument('--face', required=True, help='Path to face image or video')
    parser.add_argument('--output', default='output.mp4', help='Output video path')
    args = parser.parse_args()

    # Resolve user paths up front so they keep pointing at the same files
    # regardless of any working-directory change made by a later stage.
    face_path = os.path.abspath(args.face)
    output_path = os.path.abspath(args.output)

    # Fail fast: script generation and speech synthesis are slow, so reject a
    # missing face file before doing any of that work.
    if not os.path.exists(face_path):
        parser.error(f"face file not found: {args.face}")

    # Intermediate audio lives in a temporary directory that is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_file = os.path.join(tmpdir, 'speech.wav')

        # Generate script
        script = generate_script(args.prompt)
        if not script:
            # An empty script would only yield silent/invalid audio downstream.
            raise SystemExit("Script generation returned no text; aborting.")
        print(f"Generated script:\n{script}\n")

        # Convert script to speech
        text_to_speech(script, audio_file)

        # Animate avatar
        animate_avatar(face_path, audio_file, output_path)

    # Only report success when the video file is actually there.
    if not os.path.exists(output_path):
        raise SystemExit(f"Avatar animation did not produce {output_path}")
    print(f"Video generated and saved to {args.output}")


if __name__ == "__main__":
    main()

# Mark the script as executable so it can be launched as ./avatar_generator.py
chmod +x avatar_generator.py

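Step 6: Run the Avatar Generator

For example: ./avatar_generator.py --prompt "Write a short welcome message" --face avatar.jpg --output output.mp4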

Step 7: Add Optional Captions (Using Local Whisper)

#!/usr/bin/env python3
"""Transcribe an audio file with Whisper and burn the text into a video."""
import argparse
import os
import subprocess
import tempfile


def generate_captions(audio_file):
    """Generate captions using Whisper.

    Args:
        audio_file: Path to the audio file to transcribe.

    Returns:
        The ``"text"`` entry of the result returned by the Whisper "base"
        model's ``transcribe`` call.
    """
    # Imported lazily so that ``--help`` and caption burning do not pay the
    # cost of loading Whisper (and do not require it to be installed).
    import whisper

    print("Generating captions...")
    model = whisper.load_model("base")
    result = model.transcribe(audio_file)
    return result["text"]


def add_captions_to_video(video_file, caption_text, output_file):
    """Add captions to video using FFmpeg.

    The whole caption text is shown as a single subtitle cue spanning
    00:00:00 to 00:05:00.

    Args:
        video_file: Path to the input video.
        caption_text: Text to display as the subtitle.
        output_file: Path of the captioned video to write.

    Raises:
        FileNotFoundError: If ``ffmpeg`` is not on ``PATH``.
        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero
            status.
    """
    print("Adding captions to video...")
    # ffmpeg runs inside the temporary directory (see below), so resolve the
    # caller's paths against the current directory first.
    video_abs = os.path.abspath(video_file)
    output_abs = os.path.abspath(output_file)

    # Write the SRT into a private temp directory instead of the caller's
    # working directory, so an existing captions.srt there is never
    # overwritten or deleted, and cleanup happens even if ffmpeg fails.
    with tempfile.TemporaryDirectory() as srt_dir:
        srt_path = os.path.join(srt_dir, "captions.srt")
        with open(srt_path, "w", encoding="utf-8") as f:
            f.write("1\n00:00:00,000 --> 00:05:00,000\n" + caption_text)

        # Running with cwd=srt_dir lets the filter refer to the bare file
        # name, which avoids escaping special characters of a full path
        # inside the filter expression.
        subprocess.run(
            ["ffmpeg", "-i", video_abs, "-vf", "subtitles=captions.srt", output_abs],
            cwd=srt_dir,
            check=True,
        )


def main():
    """Parse command-line arguments, transcribe the audio and caption the video."""
    parser = argparse.ArgumentParser(description='Add captions to a video')
    parser.add_argument('--video', required=True, help='Input video file')
    parser.add_argument('--audio', required=True, help='Input audio file for transcription')
    parser.add_argument('--output', default='captioned_video.mp4', help='Output video path')
    args = parser.parse_args()

    captions = generate_captions(args.audio)
    add_captions_to_video(args.video, captions, args.output)
    print(f"Captioned video saved to {args.output}")


if __name__ == "__main__":
    main()

Troubleshooting Tips

Conclusion

All components run locally without any cloud services, online dependencies, or API keys. This approach gives you full control over the process and protects your privacy, though it requires more computational resources than cloud-based alternatives.

Building a Local Video Avatar Generator Using Ollama and Open-Source Tools