InfiniteTalk turns a reference portrait and audio into a realistic talking-head video with lip-sync, supporting up to 10-minute audio in 480p or 720p.

InfiniteTalk turns a reference portrait and audio into a realistic talking-head video with lip-sync, supporting up to 10-minute audio in 480p or 720p.
แต่ละครั้งมีค่าใช้จ่าย $0.03 ด้วยเงิน $10 คุณสามารถรันได้ประมาณ 333 ครั้ง
คุณสามารถทำต่อได้:
import requests
import time
import os

# Step 1: Start video generation
generate_url = "https://api.atlascloud.ai/api/v1/model/generateVideo"
# Read the key from the environment at runtime. Python does not expand
# "$ATLASCLOUD_API_KEY" inside a string literal, so a header built from that
# literal would send the placeholder text instead of the key. Indexing
# os.environ fails fast with KeyError when the variable is not set.
API_KEY = os.environ["ATLASCLOUD_API_KEY"]
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}",
}
data = {
    "model": "atlascloud/infinitetalk",
    "prompt": "A beautiful sunset over the ocean with gentle waves",
    "width": 512,
    "height": 512,
    "duration": 3,
    "fps": 24,
}
generate_response = requests.post(generate_url, headers=headers, json=data)
generate_result = generate_response.json()
prediction_id = generate_result["data"]["id"]

# Step 2: Poll for result
poll_url = f"https://api.atlascloud.ai/api/v1/model/prediction/{prediction_id}"


def check_status():
    """Poll the prediction endpoint until the generation finishes.

    Returns:
        The first entry of the prediction's ``outputs`` list.

    Raises:
        Exception: If the prediction reports the ``failed`` status. The
            message is the response's ``error`` field when present and
            non-empty, otherwise ``"Generation failed"``.
    """
    while True:
        response = requests.get(poll_url, headers={"Authorization": f"Bearer {API_KEY}"})
        prediction = response.json()["data"]
        status = prediction["status"]
        if status in ["completed", "succeeded"]:
            output = prediction["outputs"][0]
            print("Generated video:", output)
            return output
        if status == "failed":
            # .get() so that a failed prediction without an "error" field
            # still raises the generic failure instead of a KeyError.
            raise Exception(prediction.get("error") or "Generation failed")
        # Still processing, wait 2 seconds
        time.sleep(2)
video_url = check_status()
ติดตั้งแพ็คเกจที่จำเป็นสำหรับภาษาของคุณ
pip install requests
คำขอ API ทั้งหมดต้องมีการยืนยันตัวตนผ่าน API key คุณสามารถรับ API key ได้จากแดชบอร์ด Atlas Cloud
export ATLASCLOUD_API_KEY="your-api-key-here"
import os
API_KEY = os.environ.get("ATLASCLOUD_API_KEY")
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {API_KEY}"
}
อย่าเปิดเผย API key ของคุณในโค้ดฝั่งไคลเอนต์หรือที่เก็บข้อมูลสาธารณะ ให้ใช้ตัวแปรสภาพแวดล้อมหรือพร็อกซีฝั่งเซิร์ฟเวอร์แทน
import requests
import os

url = "https://api.atlascloud.ai/api/v1/model/generateVideo"
# Build the bearer token from the environment: Python does not expand
# "$ATLASCLOUD_API_KEY" inside a string literal, so that literal would be
# sent as-is. A missing variable fails fast with KeyError.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.environ['ATLASCLOUD_API_KEY']}",
}
data = {
    "model": "your-model",
    "prompt": "A beautiful landscape",
}
response = requests.post(url, headers=headers, json=data)
print(response.json())
ส่งคำขอสร้างแบบอะซิงโครนัส API จะส่งคืน prediction ID ที่คุณสามารถใช้ตรวจสอบสถานะและดึงผลลัพธ์ได้
/api/v1/model/generateVideo
import requests
import os

url = "https://api.atlascloud.ai/api/v1/model/generateVideo"
# Build the bearer token from the environment: Python does not expand
# "$ATLASCLOUD_API_KEY" inside a string literal, so that literal would be
# sent as-is. A missing variable fails fast with KeyError.
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.environ['ATLASCLOUD_API_KEY']}",
}
data = {
    "model": "atlascloud/infinitetalk",
    "input": {
        "prompt": "A beautiful sunset over the ocean with gentle waves",
    },
}
response = requests.post(url, headers=headers, json=data)
result = response.json()
# NOTE(review): the quick-start sample in this document reads the ID from
# result["data"]["id"], while this sample reads a top-level "id" -- confirm
# which response shape the API actually returns.
print(f"Prediction ID: {result['id']}")
print(f"Status: {result['status']}")
{
"id": "pred_abc123",
"status": "processing",
"model": "model-name",
"created_at": "2025-01-01T00:00:00Z"
}
เรียก prediction endpoint ซ้ำเป็นระยะ (polling) เพื่อตรวจสอบสถานะปัจจุบันของคำขอ
/api/v1/model/prediction/{prediction_id}
import requests
import time
import os

prediction_id = "pred_abc123"
url = f"https://api.atlascloud.ai/api/v1/model/prediction/{prediction_id}"
# Build the bearer token from the environment: Python does not expand
# "$ATLASCLOUD_API_KEY" inside a string literal, so that literal would be
# sent as-is. A missing variable fails fast with KeyError.
headers = {"Authorization": f"Bearer {os.environ['ATLASCLOUD_API_KEY']}"}
while True:
response = requests.get(url, headers=headers)
result = response.json()
status = result["data"]["status"]
print(f"Status: {status}")
if status in ["completed", "succeeded"]:
output_url = result["data"]["outputs"][0]
print(f"Output URL: {output_url}")
break
elif status == "failed":
print(f"Error: {result['data'].get('error', 'Unknown')}")
break
time.sleep(3)
processing: คำขอยังอยู่ระหว่างการประมวลผล
completed: การสร้างเสร็จสมบูรณ์แล้ว ผลลัพธ์พร้อมใช้งาน
succeeded: การสร้างสำเร็จแล้ว ผลลัพธ์พร้อมใช้งาน
failed: การสร้างล้มเหลว ตรวจสอบฟิลด์ error
{
"data": {
"id": "pred_abc123",
"status": "completed",
"outputs": [
"https://storage.atlascloud.ai/outputs/result.mp4"
],
"metrics": {
"predict_time": 45.2
},
"created_at": "2025-01-01T00:00:00Z",
"completed_at": "2025-01-01T00:00:10Z"
}
}
อัปโหลดไฟล์ไปยังที่เก็บข้อมูล Atlas Cloud และรับ URL ที่คุณสามารถใช้ในคำขอ API ของคุณ ใช้ multipart/form-data ในการอัปโหลด
/api/v1/model/uploadMedia
import requests
import os

url = "https://api.atlascloud.ai/api/v1/model/uploadMedia"
# Build the bearer token from the environment: Python does not expand
# "$ATLASCLOUD_API_KEY" inside a string literal, so that literal would be
# sent as-is. A missing variable fails fast with KeyError.
headers = {"Authorization": f"Bearer {os.environ['ATLASCLOUD_API_KEY']}"}
# The request is sent inside the `with` block so the file handle is still
# open while it is uploaded.
with open("image.png", "rb") as f:
    files = {"file": ("image.png", f, "image/png")}
    response = requests.post(url, headers=headers, files=files)
result = response.json()
download_url = result["data"]["download_url"]
print(f"File URL: {download_url}")
{
"data": {
"download_url": "https://storage.atlascloud.ai/uploads/abc123/image.png",
"file_name": "image.png",
"content_type": "image/png",
"size": 1024000
}
}
พารามิเตอร์ต่อไปนี้ยอมรับในเนื้อหาคำขอ
ไม่มีพารามิเตอร์ที่ใช้ได้
{
"model": "atlascloud/infinitetalk"
}
API จะส่งคืนการตอบกลับ prediction พร้อม URL ของผลลัพธ์ที่สร้างขึ้น
{
"id": "pred_abc123",
"status": "completed",
"model": "model-name",
"outputs": [
"https://storage.atlascloud.ai/outputs/result.mp4"
],
"metrics": {
"predict_time": 45.2
},
"created_at": "2025-01-01T00:00:00Z",
"completed_at": "2025-01-01T00:00:10Z"
}
Atlas Cloud Skills เชื่อมต่อโมเดล AI กว่า 300 โมเดลเข้ากับผู้ช่วยเขียนโค้ด AI ของคุณโดยตรง ติดตั้งด้วยคำสั่งเดียว จากนั้นใช้ภาษาธรรมชาติเพื่อสร้างรูปภาพ วิดีโอ และสนทนากับ LLM
npx skills add AtlasCloudAI/atlas-cloud-skills
รับ API key จากแดชบอร์ด Atlas Cloud และตั้งค่าเป็นตัวแปรสภาพแวดล้อม
export ATLASCLOUD_API_KEY="your-api-key-here"
เมื่อติดตั้งแล้ว คุณสามารถใช้ภาษาธรรมชาติในผู้ช่วย AI ของคุณเพื่อเข้าถึงโมเดล Atlas Cloud ทั้งหมด
Atlas Cloud MCP Server เชื่อมต่อ IDE ของคุณกับโมเดล AI กว่า 300 โมเดลผ่าน Model Context Protocol ใช้งานได้กับไคลเอนต์ที่รองรับ MCP ทุกตัว
npx -y atlascloud-mcp
เพิ่มการกำหนดค่าต่อไปนี้ลงในไฟล์ตั้งค่า MCP ของ IDE ของคุณ
{
"mcpServers": {
"atlascloud": {
"command": "npx",
"args": [
"-y",
"atlascloud-mcp"
],
"env": {
"ATLASCLOUD_API_KEY": "your-api-key-here"
}
}
}
}
ไม่มี Schema
InfiniteTalk is an audio-driven video generation model developed by AtlasCloud that transforms a single portrait image into a realistic talking-head video synchronized to any speech audio input. Built on a modified Wan2.1 I2V-14B diffusion transformer backbone with a dedicated audio cross-attention module, InfiniteTalk achieves phoneme-level lip synchronization while preserving the subject's identity, hairstyle, clothing, and background throughout the entire video.
InfiniteTalk's core innovation lies in its triple cross-attention architecture: each transformer block processes visual self-attention, text prompt cross-attention, and frame-level audio cross-attention in sequence, enabling precise per-frame audio-visual alignment. Combined with a streaming inference pipeline that processes video in overlapping segments, InfiniteTalk supports continuous video generation of up to 10 minutes from a single request — far exceeding the typical 5–15 second limit of conventional image-to-video models. The model also supports dual-person mode, animating two speakers simultaneously within the same frame using separate audio tracks and bounding box annotations.
Triple Cross-Attention Audio Conditioning: Unlike text-only conditioned video models, InfiniteTalk injects audio embeddings at every transformer block via a dedicated cross-attention layer. Audio features are extracted frame-by-frame using a Wav2Vec2 encoder, providing per-frame speech signal anchoring that drives natural mouth movements, facial micro-expressions, and head motion synchronized to the audio input.
Streaming Long-Form Video Generation: InfiniteTalk's streaming mode processes audio in overlapping clip segments with configurable motion frame overlap, automatically concatenating segments into seamless long-form video. This enables generation of minutes-long talking videos without quality degradation or identity drift — a capability not available in standard image-to-video pipelines limited to single-shot outputs.
High-Fidelity Identity Preservation: The model maintains consistent facial identity, hairstyle, clothing texture, and background composition across the entire generated video. The audio conditioning signal provides strong per-frame constraints that prevent the identity drift commonly observed in long unconditional video generation.
Dual-Person Conversation Mode: InfiniteTalk supports animating two speakers in a single scene by accepting separate audio tracks and bounding box coordinates for each person. This enables realistic conversation scenarios, interview formats, and dialogue-driven content without requiring separate generation passes or post-production compositing.
Flexible Input Modalities: The model accepts either a static portrait image or a reference video as the visual source, combined with audio in WAV or MP3 format. Text prompts provide additional guidance for expression style, posture, and behavioral nuance, giving creators fine-grained control over the generated output.
Conditional VSR Upscaling: When generating at 720p resolution with audio duration under 60 seconds, InfiniteTalk automatically routes output through a FlashVSR super-resolution pipeline, delivering enhanced visual clarity without additional user configuration or cost management.
InfiniteTalk is built on the Wan2.1 I2V-14B foundation model (14 billion parameters, 480p native resolution) with custom InfiniteTalk adapter weights that introduce the audio cross-attention pathway. The audio encoder uses a Chinese-Wav2Vec2-Base model that extracts frame-aligned speech embeddings at 25 fps video rate, creating a one-to-one correspondence between audio features and generated video frames.
The inference pipeline operates in two modes. In clip mode, the model generates a single video segment of up to 81 frames (approximately 3.2 seconds at 25 fps), suitable for short-form content. In streaming mode, the model iteratively generates overlapping clips with a configurable motion frame overlap (default: 9 frames), seamlessly blending segments to produce arbitrarily long video bounded only by the input audio duration and a configurable maximum frame limit.
The diffusion process uses a configurable number of denoising steps (default: 40, tunable from 1–100) with TeaCache acceleration for improved throughput. On NVIDIA H200 hardware, each 81-frame clip requires approximately 3.5 minutes of processing time, yielding a generation-to-output ratio of roughly 10–30× depending on resolution and hardware load.
For 720p output, the system employs a two-stage pipeline: base generation at 480p followed by conditional FlashVSR 4× upscaling (target: 921,600 pixels at 25 fps), applied automatically when audio duration is 60 seconds or less.
InfiniteTalk addresses a specific niche — audio-driven talking-head video — that differs from general-purpose text-to-video or image-to-video models. Its performance should be evaluated primarily on lip-sync accuracy, identity consistency, and long-form stability rather than visual diversity or cinematic motion range.
| Capability | InfiniteTalk | General I2V Models | Dedicated Lip-Sync Tools |
|---|---|---|---|
| Lip-sync accuracy | Phoneme-level, multi-language | N/A (no audio input) | Word-level, often English-only |
| Maximum duration | Up to 10 minutes (streaming) | 5–15 seconds typical | 30–60 seconds typical |
| Identity preservation | High (audio-anchored per-frame) | Moderate (drift in longer clips) | Moderate |
| Dual-person support | Native | Not available | Rare |
| Resolution | 480p native, 720p with VSR | Up to 1080p | Varies |
| Audio input | Any language WAV/MP3 | N/A | Usually English TTS |
InfiniteTalk achieves strong lip-sync fidelity across Chinese, English, Japanese, and other languages tested, owing to the language-agnostic Wav2Vec2 audio feature extraction. Identity drift is minimal even in 5+ minute generations due to the per-frame audio conditioning anchor.
Digital Avatar & Virtual Presenter: Create realistic talking-head videos for virtual hosts, AI assistants, and digital spokespersons using a single photo and recorded or synthesized speech audio.
Video Dubbing & Localization: Generate lip-synced video from translated audio tracks, enabling cost-effective multilingual content adaptation without re-filming or manual lip-sync editing.
Online Education & Training: Produce instructor-led video content at scale from lecture audio recordings and a single instructor photograph, reducing video production costs for e-learning platforms.
Podcast & Interview Visualization: Transform audio-only podcast or interview recordings into engaging video content with realistic speaker animations, suitable for social media distribution.
Customer Service & Chatbot Video: Generate personalized video responses driven by TTS audio output, enabling human-like video communication in automated customer interaction flows.
Social Media Content at Scale: Rapidly produce talking-head content for influencer accounts, news summaries, or commentary formats using text-to-speech pipelines combined with InfiniteTalk video generation.