Upload 11 files
- .gitignore +168 -0
- LICENSE +21 -0
- README.md +266 -10
- app.py +374 -0
- cog.yaml +35 -0
- inference.py +159 -0
- launcher.py +197 -0
- predict.py +214 -0
- quick_demo.ipynb +208 -0
- requirements3d.txt +21 -0
- webui.bat +17 -0
.gitignore
ADDED
@@ -0,0 +1,168 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

examples/results/*
gfpgan/*
checkpoints/
results/*
Dockerfile
start_docker.sh
start.sh
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Tencent AI Lab

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,10 +1,266 @@
<div align="center">

<img src='https://user-images.githubusercontent.com/4397546/229094115-862c747e-7397-4b54-ba4a-bd368bfe2e0f.png' width='500px'/>

<!--<h2> 😭 SadTalker: <span style="font-size:12px">Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation </span> </h2> -->

<a href='https://arxiv.org/abs/2211.12194'><img src='https://img.shields.io/badge/ArXiv-PDF-red'></a> <a href='https://sadtalker.github.io'><img src='https://img.shields.io/badge/Project-Page-Green'></a> [](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb) [](https://huggingface.co/spaces/vinthony/SadTalker) [](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) [](https://replicate.com/cjwbw/sadtalker)

<div>
<a target='_blank'>Wenxuan Zhang <sup>*,1,2</sup></a>
<a href='https://vinthony.github.io/' target='_blank'>Xiaodong Cun <sup>*,2</sup></a>
<a href='https://xuanwangvc.github.io/' target='_blank'>Xuan Wang <sup>3</sup></a>
<a href='https://yzhang2016.github.io/' target='_blank'>Yong Zhang <sup>2</sup></a>
<a href='https://xishen0220.github.io/' target='_blank'>Xi Shen <sup>2</sup></a> </br>
<a href='https://yuguo-xjtu.github.io/' target='_blank'>Yu Guo <sup>1</sup></a>
<a href='https://scholar.google.com/citations?hl=zh-CN&user=4oXBp9UAAAAJ' target='_blank'>Ying Shan <sup>2</sup></a>
<a target='_blank'>Fei Wang <sup>1</sup></a>
</div>
<br>
<div>
<sup>1</sup> Xi'an Jiaotong University   <sup>2</sup> Tencent AI Lab   <sup>3</sup> Ant Group
</div>
<br>
<i><strong><a href='https://arxiv.org/abs/2211.12194' target='_blank'>CVPR 2023</a></strong></i>
<br>
<br>



<b>TL;DR: single portrait image 🙎♂️ + audio 🎤 = talking head video 🎞.</b>

<br>

</div>

## 🔥 Highlight

- 🔥 The extension for [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) is online. Check out more details [here](docs/webui_extension.md).

https://user-images.githubusercontent.com/4397546/231495639-5d4bb925-ea64-4a36-a519-6389917dac29.mp4

- 🔥 `full image mode` is online! Check out [here](https://github.com/Winfredy/SadTalker#full-bodyimage-generation) for more details.

| still + enhancer in v0.0.1 | still + enhancer in v0.0.2 | [input image @bagbag1815](https://twitter.com/bagbag1815/status/1642754319094108161) |
|:--------------------: |:--------------------: | :----: |
| <video src="https://user-images.githubusercontent.com/48216707/229484996-5d7be64f-2553-4c9e-a452-c5cf0b8ebafe.mp4" type="video/mp4"> </video> | <video src="https://user-images.githubusercontent.com/4397546/230717873-355b7bf3-d3de-49f9-a439-9220e623fce7.mp4" type="video/mp4"> </video> | <img src='./examples/source_image/full_body_2.png' width='380'> |

- 🔥 Several new modes, e.g. `still mode`, `reference mode`, and `resize mode`, are online for better and more customizable applications.

- 🔥 Happy to see more community demos on [bilibili](https://search.bilibili.com/all?keyword=sadtalker&from_source=webtop_search&spm_id_from=333.1007&search_source=3), [YouTube](https://www.youtube.com/results?search_query=sadtalker&sp=CAM%253D) and [Twitter #sadtalker](https://twitter.com/search?q=%23sadtalker&src=typed_query).

## 📋 Changelog (the previous changelog can be found [here](docs/changlelog.md))

- __[2023.04.15]__: Added the Automatic1111 Colab by @camenduru, thanks for this awesome Colab: [](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb).

- __[2023.04.12]__: Added a more detailed sd-webui installation document; fixed the reinstallation problem.

- __[2023.04.12]__: Fixed the sd-webui safety issues caused by third-party packages; optimized the output path in `sd-webui-extension`.

- __[2023.04.08]__: ❗️❗️❗️ In v0.0.2, we add a logo watermark to the generated video to prevent abuse, since it is very realistic.

- __[2023.04.08]__: v0.0.2: full image animation, added a Baidu drive for downloading checkpoints, and optimized the enhancer logic.

## 🚧 TODO

<details><summary> Previous TODOs </summary>

- [x] Generating 2D face from a single image.
- [x] Generating 3D face from audio.
- [x] Generating 4D free-view talking examples from audio and a single image.
- [x] Gradio/Colab demo.
- [x] Full body/image generation.
- [x] Integrate with stable-diffusion-webui. (stay tuned!)
</details>

- [ ] Audio-driven anime avatar.
- [ ] Training code for each component.

## If you have any problems, please read our [FAQ](docs/FAQ.md) before opening an issue.

## ⚙️ 1. Installation.

Tutorials from communities: [Chinese Windows tutorial (中文Windows教程)](https://www.bilibili.com/video/BV1Dc411W7V6/) | [Japanese course (日本語コース)](https://br-d.fanbox.cc/posts/5685086?utm_campaign=manage_post_page&utm_medium=share&utm_source=twitter)

### Linux:

1. Install [Anaconda](https://www.anaconda.com/), Python, and git.

2. Create the env and install the requirements.
```bash
git clone https://github.com/Winfredy/SadTalker.git

cd SadTalker

conda create -n sadtalker python=3.8

conda activate sadtalker

pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113

conda install ffmpeg

pip install -r requirements.txt

### tts is optional for the gradio demo.
### pip install TTS

```
### Windows ([Chinese Windows tutorial (中文Windows教程)](https://www.bilibili.com/video/BV1Dc411W7V6/)):

1. Install [Python 3.10.6](https://www.python.org/downloads/windows/), checking "Add Python to PATH".
2. Install [git](https://git-scm.com/download/win) manually (or `scoop install git` via [scoop](https://scoop.sh/)).
3. Install `ffmpeg`, following [this instruction](https://www.wikihow.com/Install-FFmpeg-on-Windows) (or using `scoop install ffmpeg` via [scoop](https://scoop.sh/)).
4. Download the SadTalker repository, for example by running `git clone https://github.com/Winfredy/SadTalker.git`.
5. Download the `checkpoint` and `gfpgan` models [below↓](https://github.com/Winfredy/SadTalker#-2-download-trained-models).
6. Run `start.bat` from Windows Explorer as a normal, non-administrator user; a gradio WebUI demo will be started.

### macOS:

More tips about installation on macOS and the Docker file can be found [here](docs/install.md).

## 📥 2. Download Trained Models.

You can run the following script to put all the models in the right place.

```bash
bash scripts/download_models.sh
```

Other alternatives:
> We also provide an offline patch (`gfpgan/`), so no model will be downloaded when generating.

**Google Drive**: download our pre-trained model from [this link (main checkpoints)](https://drive.google.com/drive/folders/1Wd88VDoLhVzYsQ30_qDVluQr_Xm46yHT?usp=sharing) and [gfpgan (offline patch)](https://drive.google.com/file/d/19AIBsmfcHW6BRJmeqSFlG5fL445Xmsyi?usp=sharing).

**GitHub release page**: download all the files from the [latest GitHub release page](https://github.com/Winfredy/SadTalker/releases), and then put them in `./checkpoints`.

**Baidu Netdisk (百度云盘)**: we provide the models in [checkpoints (extraction code: sadt)](https://pan.baidu.com/s/1nXuVNd0exUl37ISwWqbFGA?pwd=sadt) and [gfpgan (extraction code: sadt)](https://pan.baidu.com/s/1kb1BCPaLOWX1JJb9Czbn6w?pwd=sadt).

<details><summary>Model Details</summary>

The final folder will be shown as:

<img width="331" alt="image" src="https://user-images.githubusercontent.com/4397546/232511411-4ca75cbf-a434-48c5-9ae0-9009e8316484.png">

Model explanations:

| Model | Description
| :--- | :----------
|checkpoints/auido2exp_00300-model.pth | Pre-trained ExpNet in SadTalker.
|checkpoints/auido2pose_00140-model.pth | Pre-trained PoseVAE in SadTalker.
|checkpoints/mapping_00229-model.pth.tar | Pre-trained MappingNet in SadTalker.
|checkpoints/mapping_00109-model.pth.tar | Pre-trained MappingNet in SadTalker.
|checkpoints/facevid2vid_00189-model.pth.tar | Pre-trained face-vid2vid model from [the reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis).
|checkpoints/epoch_20.pth | Pre-trained 3DMM extractor in [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction).
|checkpoints/wav2lip.pth | Highly accurate lip-sync model in [Wav2lip](https://github.com/Rudrabha/Wav2Lip).
|checkpoints/shape_predictor_68_face_landmarks.dat | Face landmark model used in [dlib](http://dlib.net/).
|checkpoints/BFM | 3DMM library file.
|checkpoints/hub | Face detection models used in [face alignment](https://github.com/1adrianb/face-alignment).
|gfpgan/weights | Face detection and enhancement models used in `facexlib` and `gfpgan`.

</details>

## 🔮 3. Quick Start ([Best Practice](docs/best_practice.md)).

### WebUI Demos:

**Online**: [Huggingface](https://huggingface.co/spaces/vinthony/SadTalker) | [SDWebUI-Colab](https://colab.research.google.com/github/camenduru/stable-diffusion-webui-colab/blob/main/video/stable/stable_diffusion_1_5_video_webui_colab.ipynb) | [Colab](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)

**Local Automatic1111 stable-diffusion webui extension**: please refer to the [Automatic1111 stable-diffusion webui docs](docs/webui_extension.md).

**Local gradio demo**: similar to our [Hugging Face demo](https://huggingface.co/spaces/vinthony/SadTalker); it can be run by:

```bash
## you need to manually install TTS (https://github.com/coqui-ai/TTS) via `pip install tts` in advance.
python app.py
```

**Local Windows gradio demo**: just double-click `webui.bat`; the requirements will be installed automatically.

### Manual usage:

##### Animating a portrait image with the default config:
```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <video.mp4 or picture.png> \
                    --enhancer gfpgan
```
The results will be saved in `results/$SOME_TIMESTAMP/*.mp4`.

##### Full body/image generation:

Use `--still` to generate a natural full-body video. You can add `--enhancer` to improve the quality of the generated video.

```bash
python inference.py --driven_audio <audio.wav> \
                    --source_image <video.mp4 or picture.png> \
                    --result_dir <a folder to store results> \
                    --still \
                    --preprocess full \
                    --enhancer gfpgan
```

More examples, configuration options, and tips can be found in the [>>> best practice documents <<<](docs/best_practice.md).

## 🛎 Citation

If you find our work useful in your research, please consider citing:

```bibtex
@article{zhang2022sadtalker,
  title={SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation},
  author={Zhang, Wenxuan and Cun, Xiaodong and Wang, Xuan and Zhang, Yong and Shen, Xi and Guo, Yu and Shan, Ying and Wang, Fei},
  journal={arXiv preprint arXiv:2211.12194},
  year={2022}
}
```

## 💗 Acknowledgements

Facerender code borrows heavily from [zhanglonghao's reproduction of face-vid2vid](https://github.com/zhanglonghao1992/One-Shot_Free-View_Neural_Talking_Head_Synthesis) and [PIRender](https://github.com/RenYurui/PIRender). We thank the authors for sharing their wonderful code. In the training process, we also use models from [Deep3DFaceReconstruction](https://github.com/microsoft/Deep3DFaceReconstruction) and [Wav2lip](https://github.com/Rudrabha/Wav2Lip). We thank them for their wonderful work.

See also these wonderful third-party libraries we use:

- **Face Utils**: https://github.com/xinntao/facexlib
- **Face Enhancement**: https://github.com/TencentARC/GFPGAN
- **Image/Video Enhancement**: https://github.com/xinntao/Real-ESRGAN

## 🥂 Extensions:

- [SadTalker-Video-Lip-Sync](https://github.com/Zz-ww/SadTalker-Video-Lip-Sync) from [@Zz-ww](https://github.com/Zz-ww): SadTalker for video lip editing

## 🥂 Related Works
- [StyleHEAT: One-Shot High-Resolution Editable Talking Face Generation via Pre-trained StyleGAN (ECCV 2022)](https://github.com/FeiiYin/StyleHEAT)
- [CodeTalker: Speech-Driven 3D Facial Animation with Discrete Motion Prior (CVPR 2023)](https://github.com/Doubiiu/CodeTalker)
- [VideoReTalking: Audio-based Lip Synchronization for Talking Head Video Editing In the Wild (SIGGRAPH Asia 2022)](https://github.com/vinthony/video-retalking)
- [DPE: Disentanglement of Pose and Expression for General Video Portrait Editing (CVPR 2023)](https://github.com/Carlyx/DPE)
- [3D GAN Inversion with Facial Symmetry Prior (CVPR 2023)](https://github.com/FeiiYin/SPI/)
- [T2M-GPT: Generating Human Motion from Textual Descriptions with Discrete Representations (CVPR 2023)](https://github.com/Mael-zys/T2M-GPT)

## 📢 Disclaimer

This is not an official product of Tencent. This repository can only be used for personal/research/non-commercial purposes.

LOGO: color and font suggestion: [ChatGPT](ai.com); logo font: [Montserrat Alternates](https://fonts.google.com/specimen/Montserrat+Alternates?preview.text=SadTalker&preview.text_type=custom&query=mont).

All the demo images and audio are copyrighted by community users or generated by Stable Diffusion. Feel free to contact us if you feel uncomfortable.
app.py
ADDED
@@ -0,0 +1,374 @@
from flask import Flask, request, jsonify
import torch
import shutil
import os
import sys
from argparse import ArgumentParser
from time import strftime
from argparse import Namespace
from src.utils.preprocess import CropAndExtract
from src.test_audio2coeff import Audio2Coeff
from src.facerender.animate import AnimateFromCoeff
from src.generate_batch import get_data
from src.generate_facerender_batch import get_facerender_data
# from src.utils.init_path import init_path
import tempfile
from openai import OpenAI
import threading
import elevenlabs
from elevenlabs import set_api_key, generate, play, clone
# from flask_cors import CORS, cross_origin
# from flask_swagger_ui import get_swaggerui_blueprint
import uuid
import time

start_time = time.time()

class AnimationConfig:
    def __init__(self, driven_audio_path, source_image_path, result_folder, pose_style, expression_scale, enhancer, still, preprocess, ref_pose_video_path):
        self.driven_audio = driven_audio_path
        self.source_image = source_image_path
        self.ref_eyeblink = ref_pose_video_path
        self.ref_pose = ref_pose_video_path
        self.checkpoint_dir = './checkpoints'
        self.result_dir = result_folder
        self.pose_style = pose_style
        self.batch_size = 2
        self.expression_scale = expression_scale
        self.input_yaw = None
        self.input_pitch = None
        self.input_roll = None
        self.enhancer = enhancer
        self.background_enhancer = None
        self.cpu = False
        self.face3dvis = False
        self.still = still
        self.preprocess = preprocess
        self.verbose = False
        self.old_version = False
        self.net_recon = 'resnet50'
        self.init_path = None
        self.use_last_fc = False
        self.bfm_folder = './checkpoints/BFM_Fitting/'
        self.bfm_model = 'BFM_model_front.mat'
        self.focal = 1015.
        self.center = 112.
        self.camera_d = 10.
        self.z_near = 5.
        self.z_far = 15.
        self.device = 'cpu'


app = Flask(__name__)

TEMP_DIR = None

app.config['temp_response'] = None
app.config['generation_thread'] = None
app.config['text_prompt'] = None
app.config['final_video_path'] = None


def main(args):
    pic_path = args.source_image
    audio_path = args.driven_audio
    save_dir = args.result_dir
    pose_style = args.pose_style
    device = args.device
    batch_size = args.batch_size
    input_yaw_list = args.input_yaw
    input_pitch_list = args.input_pitch
    input_roll_list = args.input_roll
    ref_eyeblink = args.ref_eyeblink
    ref_pose = args.ref_pose
    preprocess = args.preprocess

    dir_path = os.path.dirname(os.path.realpath(__file__))
    current_root_path = dir_path
    print('current_root_path ', current_root_path)

    # sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess)

    path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
    path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
    dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting/BFM_Fitting')
    wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')

    audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
    audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')

    audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
    audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')

    free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')

    if preprocess == 'full':
        mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
        facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
    else:
        mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
        facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')

    # preprocess_model = CropAndExtract(sadtalker_paths, device)
    # init model
    print(path_of_net_recon_model)
    preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)

    # audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
    audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
                                 audio2exp_checkpoint, audio2exp_yaml_path,
                                 wav2lip_checkpoint, device)
    # animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device)
    animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
                                          facerender_yaml_path, device)

    first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
    os.makedirs(first_frame_dir, exist_ok=True)
    # first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\
    #     source_image_flag=True, pic_size=args.size)

    first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
    print('first_coeff_path ', first_coeff_path)
    print('crop_pic_path ', crop_pic_path)

    if first_coeff_path is None:
        print("Can't get the coeffs of the input")
        return

    if ref_eyeblink is not None:
        ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
        ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
        os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
        # ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False)
        ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
    else:
        ref_eyeblink_coeff_path = None
    print('ref_eyeblink_coeff_path', ref_eyeblink_coeff_path)

    if ref_pose is not None:
        if ref_pose == ref_eyeblink:
            ref_pose_coeff_path = ref_eyeblink_coeff_path
        else:
            ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
            ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
            os.makedirs(ref_pose_frame_dir, exist_ok=True)
            # ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False)
            ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
    else:
        ref_pose_coeff_path = None
    print('ref_eyeblink_coeff_path', ref_pose_coeff_path)

    batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
    coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)

    if args.face3dvis:
        from src.face3d.visualize import gen_composed_video
        gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))

    # data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
    #                            batch_size, input_yaw_list, input_pitch_list, input_roll_list,
    #                            expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size)

    data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
                               batch_size, input_yaw_list, input_pitch_list, input_roll_list,
                               expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)

    # result, base64_video, temp_file_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
    #     enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size)

    result, base64_video, temp_file_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
        enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)

    print('The generated video is named:')
    app.config['temp_response'] = base64_video
    app.config['final_video_path'] = temp_file_path
    return base64_video, temp_file_path

    # shutil.move(result, save_dir+'.mp4')

    if not args.verbose:
        shutil.rmtree(save_dir)

def create_temp_dir():
    return tempfile.TemporaryDirectory()

def save_uploaded_file(file, filename, TEMP_DIR):
    unique_filename = str(uuid.uuid4()) + "_" + filename
    file_path = os.path.join(TEMP_DIR.name, unique_filename)
    file.save(file_path)
    return file_path

client = OpenAI(api_key="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")

def translate_text(text_prompt, target_language):
    response = client.chat.completions.create(
        model="gpt-4-0125-preview",
        messages=[{"role": "system", "content": "You are a helpful language translator assistant."},
                  {"role": "user", "content": f"Translate completely without hallucination, end to end, and give the following text to {target_language} language and the text is: {text_prompt}"},
                  ],
        max_tokens=len(text_prompt) + 200  # Use the length of the input text
        # temperature=0.3,
        # stop=["Translate:", "Text:"]
    )
    return response


@app.route("/run", methods=['POST'])
async def generate_video():
    global TEMP_DIR
    TEMP_DIR = create_temp_dir()
    if request.method == 'POST':
        source_image = request.files['source_image']
        text_prompt = request.form['text_prompt']
        print('Input text prompt: ', text_prompt)
        voice_cloning = request.form.get('voice_cloning', 'no')
        target_language = request.form.get('target_language', 'original_text')
        print('target_language', target_language)
        pose_style = int(request.form.get('pose_style', 1))
        expression_scale = int(request.form.get('expression_scale', 1))
        enhancer = request.form.get('enhancer', None)
        voice_gender = request.form.get('voice_gender', 'male')
        still_str = request.form.get('still', 'False')
        still = still_str.lower() == 'true'
        print('still', still)
        preprocess = request.form.get('preprocess', 'crop')
        print('preprocess selected: ', preprocess)
        ref_pose_video = request.files.get('ref_pose', None)

        if target_language != 'original_text':
            response = translate_text(text_prompt, target_language)
            # response = await translate_text_async(text_prompt, target_language)
            text_prompt = response.choices[0].message.content.strip()

        app.config['text_prompt'] = text_prompt
        print('Final text prompt: ', text_prompt)

        source_image_path = save_uploaded_file(source_image, 'source_image.png', TEMP_DIR)
        print(source_image_path)

        # driven_audio_path = await voice_cloning_async(voice_cloning, voice_gender, text_prompt, user_voice)

        if voice_cloning == 'no':
            if voice_gender == 'male':
                voice = 'onyx'
            else:
                voice = 'nova'

            print('Entering Audio creation using whisper')
            response = client.audio.speech.create(model="tts-1-hd",
                                                  voice=voice,
                                                  input=text_prompt)

            print('Audio created using whisper')
            with tempfile.NamedTemporaryFile(suffix=".wav", prefix="text_to_speech_", dir=TEMP_DIR.name, delete=False) as temp_file:
                driven_audio_path = temp_file.name

            response.write_to_file(driven_audio_path)
            print('Audio file saved')

        elif voice_cloning == 'yes':
            user_voice = request.files['user_voice']

            with tempfile.NamedTemporaryFile(suffix=".wav", prefix="user_voice_", dir=TEMP_DIR.name, delete=False) as temp_file:
                user_voice_path = temp_file.name
                user_voice.save(user_voice_path)
            print('user_voice_path', user_voice_path)

            set_api_key("87792fce164425fbe1204e9fd1fe25cd")
            voice = clone(name="User Cloned Voice",
                          files=[user_voice_path])

            audio = generate(text=text_prompt, voice=voice, model="eleven_multilingual_v2", stream=True, latency=4)
            with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_", dir=TEMP_DIR.name, delete=False) as temp_file:
                for chunk in audio:
                    temp_file.write(chunk)
                driven_audio_path = temp_file.name
            print('driven_audio_path', driven_audio_path)

            # elevenlabs.save(audio, driven_audio_path)

        save_dir = tempfile.mkdtemp(dir=TEMP_DIR.name)
        result_folder = os.path.join(save_dir, "results")
        os.makedirs(result_folder, exist_ok=True)

        ref_pose_video_path = None
        if ref_pose_video:
            with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="ref_pose_", dir=TEMP_DIR.name, delete=False) as temp_file:
                ref_pose_video_path = temp_file.name
                ref_pose_video.save(ref_pose_video_path)
            print('ref_pose_video_path', ref_pose_video_path)

        # Example of using the class with some hypothetical paths
        args = AnimationConfig(driven_audio_path=driven_audio_path, source_image_path=source_image_path, result_folder=result_folder, pose_style=pose_style, expression_scale=expression_scale, enhancer=enhancer, still=still, preprocess=preprocess, ref_pose_video_path=ref_pose_video_path)

        if torch.cuda.is_available() and not args.cpu:
            args.device = "cuda"
        else:
            args.device = "cpu"

        generation_thread = threading.Thread(target=main, args=(args,))
        app.config['generation_thread'] = generation_thread
        generation_thread.start()
        response_data = {"message": "Video generation started",
                         "process_id": generation_thread.ident}

        return jsonify(response_data)
        # base64_video = main(args)
        # return jsonify({"base64_video": base64_video})

    # else:
    #     return 'Unsupported HTTP method', 405

@app.route("/status", methods=["GET"])
def check_generation_status():
    global TEMP_DIR
    response = {"base64_video": "", "text_prompt": "", "status": ""}
    process_id = request.args.get('process_id', None)

    # process_id is required to check the status for that specific process
    if process_id:
        generation_thread = app.config.get('generation_thread')
        if generation_thread and generation_thread.ident == int(process_id) and generation_thread.is_alive():
            return jsonify({"status": "in_progress"}), 200
        elif app.config.get('temp_response'):
            # app.config['temp_response']['status'] = 'completed'
            final_response = app.config['temp_response']
            response["base64_video"] = final_response
            response["text_prompt"] = app.config.get('text_prompt')
            response["status"] = "completed"

            final_video_path = app.config['final_video_path']
            print('final_video_path', final_video_path)

            if final_video_path and os.path.exists(final_video_path):
                os.remove(final_video_path)
                print("Deleted video file:", final_video_path)

            TEMP_DIR.cleanup()
            # print("Temporary Directory:", TEMP_DIR.name)
            # if TEMP_DIR:
            #     print("Contents of Temporary Directory:")
            #     for filename in os.listdir(TEMP_DIR.name):
            #         print(filename)
            # else:
            #     print("Temporary Directory is None or already cleaned up.")
            end_time = time.time()
            total_time = round(end_time - start_time, 2)
            print("Total time taken for execution:", total_time, " seconds")
            return jsonify(response)
    return jsonify({"error": "No process id provided"})

@app.route("/health", methods=["GET"])
def health_status():
    response = {"online": "true"}
    return jsonify(response)

if __name__ == '__main__':
    app.run(debug=True)
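As a usage sketch (not part of this commit), the Flask API defined in app.py can be exercised with a small client like the one below. The host/port and file paths are assumptions; the form fields mirror what `/run` reads, and `/status` is polled with the returned `process_id` until the base64-encoded video is available.

```python
# Hypothetical client for the /run and /status endpoints in app.py,
# assuming the Flask app is running locally on port 5000.
import time
import requests

with open("face.png", "rb") as img:  # placeholder portrait image
    resp = requests.post(
        "http://127.0.0.1:5000/run",
        files={"source_image": img},
        data={
            "text_prompt": "Hello from SadTalker",
            "voice_cloning": "no",        # 'yes' would also require a 'user_voice' file upload
            "voice_gender": "male",
            "target_language": "original_text",
            "preprocess": "full",
            "still": "True",
        },
    )
process_id = resp.json()["process_id"]

# Poll /status until the background generation thread finishes.
while True:
    status = requests.get(
        "http://127.0.0.1:5000/status", params={"process_id": process_id}
    ).json()
    if status.get("status") == "completed":
        base64_video = status["base64_video"]
        break
    time.sleep(5)
```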
cog.yaml
ADDED
@@ -0,0 +1,35 @@
build:
  gpu: true
  cuda: "11.3"
  python_version: "3.8"
  system_packages:
    - "ffmpeg"
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
  python_packages:
    - "torch==1.12.1"
    - "torchvision==0.13.1"
    - "torchaudio==0.12.1"
    - "joblib==1.1.0"
    - "scikit-image==0.19.3"
    - "basicsr==1.4.2"
    - "facexlib==0.3.0"
    - "resampy==0.3.1"
    - "pydub==0.25.1"
    - "scipy==1.10.1"
    - "kornia==0.6.8"
    - "face_alignment==1.3.5"
    - "imageio==2.19.3"
    - "imageio-ffmpeg==0.4.7"
    - "librosa==0.9.2"
    - "tqdm==4.65.0"
    - "yacs==0.1.8"
    - "gfpgan==1.3.8"
    - "dlib-bin==19.24.1"
    - "av==10.0.0"
    - "trimesh==3.9.20"
  run:
    - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth" "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth"
    - mkdir -p /root/.cache/torch/hub/checkpoints/ && wget --output-document "/root/.cache/torch/hub/checkpoints/2DFAN4-cd938726ad.zip" "https://www.adrianbulat.com/downloads/python-fan/2DFAN4-cd938726ad.zip"

predict: "predict.py:Predictor"
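The `predict: "predict.py:Predictor"` entry points Cog at a predictor class in predict.py, which is part of this upload but not shown in this listing. Purely as a hypothetical sketch of the interface Cog expects (not the actual predict.py from this commit), a predictor looks roughly like:

```python
# Hypothetical skeleton of the Cog predictor interface referenced by cog.yaml;
# the real predict.py in this commit may differ.
from cog import BasePredictor, Input, Path


class Predictor(BasePredictor):
    def setup(self):
        # Load SadTalker checkpoints once per container start
        # (assumption: the same checkpoint paths as inference.py).
        ...

    def predict(
        self,
        source_image: Path = Input(description="Portrait image"),
        driven_audio: Path = Input(description="Driving audio (.wav)"),
    ) -> Path:
        # Run the same pipeline as inference.py and return the generated .mp4.
        raise NotImplementedError("sketch only")
```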
inference.py
ADDED
@@ -0,0 +1,159 @@
| 1 |
+
import torch
|
| 2 |
+
from time import strftime
|
| 3 |
+
import os, sys, time
|
| 4 |
+
from argparse import ArgumentParser
|
| 5 |
+
|
| 6 |
+
from src.utils.preprocess import CropAndExtract
|
| 7 |
+
from src.test_audio2coeff import Audio2Coeff
|
| 8 |
+
from src.facerender.animate import AnimateFromCoeff
|
| 9 |
+
from src.generate_batch import get_data
|
| 10 |
+
from src.generate_facerender_batch import get_facerender_data
|
| 11 |
+
|
| 12 |
+
def main(args):
|
| 13 |
+
#torch.backends.cudnn.enabled = False
|
| 14 |
+
|
| 15 |
+
pic_path = args.source_image
|
| 16 |
+
audio_path = args.driven_audio
|
| 17 |
+
save_dir = os.path.join(args.result_dir, strftime("%Y_%m_%d_%H.%M.%S"))
|
| 18 |
+
os.makedirs(save_dir, exist_ok=True)
|
| 19 |
+
pose_style = args.pose_style
|
| 20 |
+
device = args.device
|
| 21 |
+
batch_size = args.batch_size
|
| 22 |
+
input_yaw_list = args.input_yaw
|
| 23 |
+
input_pitch_list = args.input_pitch
|
| 24 |
+
input_roll_list = args.input_roll
|
| 25 |
+
ref_eyeblink = args.ref_eyeblink
|
| 26 |
+
ref_pose = args.ref_pose
|
| 27 |
+
|
| 28 |
+
current_code_path = sys.argv[0]
|
| 29 |
+
current_root_path = os.path.split(current_code_path)[0]
|
| 30 |
+
|
| 31 |
+
os.environ['TORCH_HOME']=os.path.join(current_root_path, args.checkpoint_dir)
|
| 32 |
+
|
| 33 |
+
path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
|
| 34 |
+
path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
|
| 35 |
+
dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting')
|
| 36 |
+
wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
|
| 37 |
+
|
| 38 |
+
audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
|
| 39 |
+
audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
|
| 40 |
+
|
| 41 |
+
audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
|
| 42 |
+
audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
|
| 43 |
+
|
| 44 |
+
free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
|
| 45 |
+
|
| 46 |
+
if args.preprocess == 'full':
|
| 47 |
+
mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
|
| 48 |
+
facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
|
| 49 |
+
else:
|
| 50 |
+
mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
|
| 51 |
+
facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')
|
| 52 |
+
|
| 53 |
+
#init model
|
| 54 |
+
print(path_of_net_recon_model)
|
| 55 |
+
preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
|
| 56 |
+
|
| 57 |
+
print(audio2pose_checkpoint)
|
| 58 |
+
print(audio2exp_checkpoint)
|
| 59 |
+
audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
|
| 60 |
+
audio2exp_checkpoint, audio2exp_yaml_path,
|
| 61 |
+
wav2lip_checkpoint, device)
|
| 62 |
+
|
| 63 |
+
print(free_view_checkpoint)
|
| 64 |
+
print(mapping_checkpoint)
|
| 65 |
+
animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
|
| 66 |
+
facerender_yaml_path, device)
|
| 67 |
+
|
| 68 |
+
#crop image and extract 3dmm from image
|
| 69 |
+
first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
|
| 70 |
+
os.makedirs(first_frame_dir, exist_ok=True)
|
| 71 |
+
print('3DMM Extraction for source image')
|
| 72 |
+
first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
|
| 73 |
+
if first_coeff_path is None:
|
| 74 |
+
print("Can't get the coeffs of the input")
|
| 75 |
+
return
|
| 76 |
+
|
| 77 |
+
if ref_eyeblink is not None:
|
| 78 |
+
ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
|
| 79 |
+
ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
|
| 80 |
+
os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
|
| 81 |
+
print('3DMM Extraction for the reference video providing eye blinking')
|
| 82 |
+
+        ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
+    else:
+        ref_eyeblink_coeff_path = None
+
+    if ref_pose is not None:
+        if ref_pose == ref_eyeblink:
+            ref_pose_coeff_path = ref_eyeblink_coeff_path
+        else:
+            ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
+            ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
+            os.makedirs(ref_pose_frame_dir, exist_ok=True)
+            print('3DMM Extraction for the reference video providing pose')
+            ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
+    else:
+        ref_pose_coeff_path = None
+
+    # audio2coeff
+    batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
+    coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
+
+    # 3d face render
+    if args.face3dvis:
+        from src.face3d.visualize import gen_composed_video
+        gen_composed_video(args, device, first_coeff_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
+
+    # coeff2video
+    data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
+                               batch_size, input_yaw_list, input_pitch_list, input_roll_list,
+                               expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
+
+    animate_from_coeff.generate(data, save_dir, pic_path, crop_info,
+                                enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
+
+
+if __name__ == '__main__':
+
+    parser = ArgumentParser()
+    parser.add_argument("--driven_audio", default='./examples/driven_audio/bus_chinese.wav', help="path to driven audio")
+    parser.add_argument("--source_image", default='./examples/source_image/full_body_2.png', help="path to source image")
+    parser.add_argument("--ref_eyeblink", default=None, help="path to reference video providing eye blinking")
+    parser.add_argument("--ref_pose", default=None, help="path to reference video providing pose")
+    parser.add_argument("--checkpoint_dir", default='./checkpoints', help="path to the pretrained checkpoints")
+    parser.add_argument("--result_dir", default='./results', help="path to output")
+    parser.add_argument("--pose_style", type=int, default=0, help="input pose style from [0, 46)")
+    parser.add_argument("--batch_size", type=int, default=2, help="the batch size of facerender")
+    parser.add_argument("--expression_scale", type=float, default=1., help="the expression scale of facerender")
+    parser.add_argument('--input_yaw', nargs='+', type=int, default=None, help="the input yaw degree of the user")
+    parser.add_argument('--input_pitch', nargs='+', type=int, default=None, help="the input pitch degree of the user")
+    parser.add_argument('--input_roll', nargs='+', type=int, default=None, help="the input roll degree of the user")
+    parser.add_argument('--enhancer', type=str, default=None, help="Face enhancer, [gfpgan, RestoreFormer]")
+    parser.add_argument('--background_enhancer', type=str, default=None, help="background enhancer, [realesrgan]")
+    parser.add_argument("--cpu", dest="cpu", action="store_true")
+    parser.add_argument("--face3dvis", action="store_true", help="generate 3d face and 3d landmarks")
+    parser.add_argument("--still", action="store_true", help="can crop back to the original image for the full-body animation")
+    parser.add_argument("--preprocess", default='crop', choices=['crop', 'resize', 'full'], help="how to preprocess the images")
+
+    # net structure and parameters
+    parser.add_argument('--net_recon', type=str, default='resnet50', choices=['resnet18', 'resnet34', 'resnet50'], help='unused')
+    parser.add_argument('--init_path', type=str, default=None, help='unused')
+    parser.add_argument('--use_last_fc', default=False, help='zero initialize the last fc')
+    parser.add_argument('--bfm_folder', type=str, default='./checkpoints/BFM_Fitting/')
+    parser.add_argument('--bfm_model', type=str, default='BFM_model_front.mat', help='bfm model')
+
+    # default renderer parameters
+    parser.add_argument('--focal', type=float, default=1015.)
+    parser.add_argument('--center', type=float, default=112.)
+    parser.add_argument('--camera_d', type=float, default=10.)
+    parser.add_argument('--z_near', type=float, default=5.)
+    parser.add_argument('--z_far', type=float, default=15.)
+
+    args = parser.parse_args()
+
+    if torch.cuda.is_available() and not args.cpu:
+        args.device = "cuda"
+    else:
+        args.device = "cpu"
+
+    main(args)
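For reference, a minimal usage sketch (not part of the uploaded files): calling inference.py from Python with the flags defined by the parser above. The audio and image paths are simply the argparse defaults shown there, and the sketch assumes the pretrained weights have already been downloaded into ./checkpoints (e.g. via scripts/download_models.sh).

import subprocess
import sys

# Hypothetical driver script; the flags mirror the ArgumentParser defined in inference.py above.
cmd = [
    sys.executable, "inference.py",
    "--driven_audio", "./examples/driven_audio/bus_chinese.wav",
    "--source_image", "./examples/source_image/full_body_2.png",
    "--result_dir", "./results",
    "--preprocess", "full",      # one of: crop, resize, full
    "--still",                   # keep the original framing for full-body animation
    "--enhancer", "gfpgan",      # optional face enhancer
]
subprocess.run(cmd, check=True)  # the generated video is written under ./results/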
launcher.py
ADDED
@@ -0,0 +1,197 @@
+# this script installs the necessary requirements and launches the main program in webui.py
+# borrowed from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/master/launch.py
+import subprocess
+import os
+import sys
+import importlib.util
+import shlex
+import platform
+import json
+
+python = sys.executable
+git = os.environ.get('GIT', "git")
+index_url = os.environ.get('INDEX_URL', "")
+stored_commit_hash = None
+skip_install = False
+dir_repos = "repositories"
+script_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+
+if 'GRADIO_ANALYTICS_ENABLED' not in os.environ:
+    os.environ['GRADIO_ANALYTICS_ENABLED'] = 'False'
+
+
+def check_python_version():
+    is_windows = platform.system() == "Windows"
+    major = sys.version_info.major
+    minor = sys.version_info.minor
+    micro = sys.version_info.micro
+
+    if is_windows:
+        supported_minors = [10]
+    else:
+        supported_minors = [7, 8, 9, 10, 11]
+
+    if not (major == 3 and minor in supported_minors):
+        # raise a proper exception instead of a bare string
+        raise RuntimeError(f"""
+INCOMPATIBLE PYTHON VERSION
+This program is tested with Python 3.10.6, but you have {major}.{minor}.{micro}.
+If you encounter an error with the "RuntimeError: Couldn't install torch." message,
+or any other error regarding unsuccessful package (library) installation,
+please downgrade (or upgrade) to the latest version of Python 3.10
+and delete the current Python and "venv" folder in the WebUI's directory.
+You can download Python 3.10 from here: https://www.python.org/downloads/release/python-3109/
+{"Alternatively, use a binary release of WebUI: https://github.com/AUTOMATIC1111/stable-diffusion-webui/releases" if is_windows else ""}
+Use --skip-python-version-check to suppress this warning.
+""")
+
+
+def commit_hash():
+    global stored_commit_hash
+
+    if stored_commit_hash is not None:
+        return stored_commit_hash
+
+    try:
+        stored_commit_hash = run(f"{git} rev-parse HEAD").strip()
+    except Exception:
+        stored_commit_hash = "<none>"
+
+    return stored_commit_hash
+
+
+def run(command, desc=None, errdesc=None, custom_env=None, live=False):
+    if desc is not None:
+        print(desc)
+
+    if live:
+        result = subprocess.run(command, shell=True, env=os.environ if custom_env is None else custom_env)
+        if result.returncode != 0:
+            raise RuntimeError(f"""{errdesc or 'Error running command'}.
+Command: {command}
+Error code: {result.returncode}""")
+
+        return ""
+
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, env=os.environ if custom_env is None else custom_env)
+
+    if result.returncode != 0:
+        message = f"""{errdesc or 'Error running command'}.
+Command: {command}
+Error code: {result.returncode}
+stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else '<empty>'}
+stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else '<empty>'}
+"""
+        raise RuntimeError(message)
+
+    return result.stdout.decode(encoding="utf8", errors="ignore")
+
+
+def check_run(command):
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+    return result.returncode == 0
+
+
+def is_installed(package):
+    try:
+        spec = importlib.util.find_spec(package)
+    except ModuleNotFoundError:
+        return False
+
+    return spec is not None
+
+
+def repo_dir(name):
+    return os.path.join(script_path, dir_repos, name)
+
+
+def run_python(code, desc=None, errdesc=None):
+    return run(f'"{python}" -c "{code}"', desc, errdesc)
+
+
+def run_pip(args, desc=None):
+    if skip_install:
+        return
+
+    index_url_line = f' --index-url {index_url}' if index_url != '' else ''
+    return run(f'"{python}" -m pip {args} --prefer-binary{index_url_line}', desc=f"Installing {desc}", errdesc=f"Couldn't install {desc}")
+
+
+def check_run_python(code):
+    return check_run(f'"{python}" -c "{code}"')
+
+
+def git_clone(url, dir, name, commithash=None):
+    # TODO clone into temporary dir and move if successful
+
+    if os.path.exists(dir):
+        if commithash is None:
+            return
+
+        current_hash = run(f'"{git}" -C "{dir}" rev-parse HEAD', None, f"Couldn't determine {name}'s hash: {commithash}").strip()
+        if current_hash == commithash:
+            return
+
+        run(f'"{git}" -C "{dir}" fetch', f"Fetching updates for {name}...", f"Couldn't fetch {name}")
+        run(f'"{git}" -C "{dir}" checkout {commithash}', f"Checking out commit for {name} with hash: {commithash}...", f"Couldn't checkout commit {commithash} for {name}")
+        return
+
+    run(f'"{git}" clone "{url}" "{dir}"', f"Cloning {name} into {dir}...", f"Couldn't clone {name}")
+
+    if commithash is not None:
+        run(f'"{git}" -C "{dir}" checkout {commithash}', None, f"Couldn't checkout {name}'s hash: {commithash}")
+
+
+def git_pull_recursive(dir):
+    for subdir, _, _ in os.walk(dir):
+        if os.path.exists(os.path.join(subdir, '.git')):
+            try:
+                output = subprocess.check_output([git, '-C', subdir, 'pull', '--autostash'])
+                print(f"Pulled changes for repository in '{subdir}':\n{output.decode('utf-8').strip()}\n")
+            except subprocess.CalledProcessError as e:
+                print(f"Couldn't perform 'git pull' on repository in '{subdir}':\n{e.output.decode('utf-8').strip()}\n")
+
+
+def run_extension_installer(extension_dir):
+    path_installer = os.path.join(extension_dir, "install.py")
+    if not os.path.isfile(path_installer):
+        return
+
+    try:
+        env = os.environ.copy()
+        env['PYTHONPATH'] = os.path.abspath(".")
+
+        print(run(f'"{python}" "{path_installer}"', errdesc=f"Error running install.py for extension {extension_dir}", custom_env=env))
+    except Exception as e:
+        print(e, file=sys.stderr)
+
+
+def prepare_environment():
+    global skip_install
+
+    torch_command = os.environ.get('TORCH_COMMAND', "pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117")
+    requirements_file = os.environ.get('REQS_FILE', "requirements.txt")
+
+    commit = commit_hash()
+
+    print(f"Python {sys.version}")
+    print(f"Commit hash: {commit}")
+
+    if not is_installed("torch") or not is_installed("torchvision"):
+        run(f'"{python}" -m {torch_command}', "Installing torch and torchvision", "Couldn't install torch", live=True)
+
+    run_python("import torch; assert torch.cuda.is_available(), 'Torch is not able to use GPU; add --skip-torch-cuda-test to COMMANDLINE_ARGS variable to disable this check'")
+
+    run_pip(f"install -r \"{requirements_file}\"", "requirements for SadTalker WebUI (this may take a while the first time)")
+
+
+def start():
+    print("Launching SadTalker Web UI")
+    from app import sadtalker_demo
+    demo = sadtalker_demo()
+    demo.launch(share=True)
+
+
+if __name__ == "__main__":
+    prepare_environment()
+    start()
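prepare_environment() reads TORCH_COMMAND and REQS_FILE from the environment, so the torch build and the requirements file can be swapped without editing the script. A minimal sketch of driving the launcher that way (an illustration of intended use, not part of the committed files; the torch pins shown are just the script's own defaults):

import os
import subprocess
import sys

# Hypothetical wrapper: override the env vars read by prepare_environment(), then launch.
env = os.environ.copy()
env["TORCH_COMMAND"] = (
    "pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 "
    "--extra-index-url https://download.pytorch.org/whl/cu117"
)
env["REQS_FILE"] = "requirements.txt"  # point at an alternative requirements file if needed

subprocess.run([sys.executable, "launcher.py"], env=env, check=True)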
predict.py
ADDED
@@ -0,0 +1,214 @@
+"""run bash scripts/download_models.sh first to prepare the weights file"""
+import os
+import shutil
+from argparse import Namespace
+from src.utils.preprocess import CropAndExtract
+from src.test_audio2coeff import Audio2Coeff
+from src.facerender.animate import AnimateFromCoeff
+from src.generate_batch import get_data
+from src.generate_facerender_batch import get_facerender_data
+from cog import BasePredictor, Input, Path
+
+checkpoints = "checkpoints"
+
+
+class Predictor(BasePredictor):
+    def setup(self):
+        """Load the model into memory to make running multiple predictions efficient"""
+        device = "cuda"
+
+        path_of_lm_croper = os.path.join(
+            checkpoints, "shape_predictor_68_face_landmarks.dat"
+        )
+        path_of_net_recon_model = os.path.join(checkpoints, "epoch_20.pth")
+        dir_of_BFM_fitting = os.path.join(checkpoints, "BFM_Fitting")
+        wav2lip_checkpoint = os.path.join(checkpoints, "wav2lip.pth")
+
+        audio2pose_checkpoint = os.path.join(checkpoints, "auido2pose_00140-model.pth")
+        audio2pose_yaml_path = os.path.join("src", "config", "auido2pose.yaml")
+
+        audio2exp_checkpoint = os.path.join(checkpoints, "auido2exp_00300-model.pth")
+        audio2exp_yaml_path = os.path.join("src", "config", "auido2exp.yaml")
+
+        free_view_checkpoint = os.path.join(
+            checkpoints, "facevid2vid_00189-model.pth.tar"
+        )
+
+        # init model
+        self.preprocess_model = CropAndExtract(
+            path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device
+        )
+
+        self.audio_to_coeff = Audio2Coeff(
+            audio2pose_checkpoint,
+            audio2pose_yaml_path,
+            audio2exp_checkpoint,
+            audio2exp_yaml_path,
+            wav2lip_checkpoint,
+            device,
+        )
+
+        self.animate_from_coeff = {
+            "full": AnimateFromCoeff(
+                free_view_checkpoint,
+                os.path.join(checkpoints, "mapping_00109-model.pth.tar"),
+                os.path.join("src", "config", "facerender_still.yaml"),
+                device,
+            ),
+            "others": AnimateFromCoeff(
+                free_view_checkpoint,
+                os.path.join(checkpoints, "mapping_00229-model.pth.tar"),
+                os.path.join("src", "config", "facerender.yaml"),
+                device,
+            ),
+        }
+
+    def predict(
+        self,
+        source_image: Path = Input(
+            description="Upload the source image; it can be video.mp4 or picture.png",
+        ),
+        driven_audio: Path = Input(
+            description="Upload the driven audio; accepts .wav and .mp4 files",
+        ),
+        enhancer: str = Input(
+            description="Choose a face enhancer",
+            choices=["gfpgan", "RestoreFormer"],
+            default="gfpgan",
+        ),
+        preprocess: str = Input(
+            description="how to preprocess the images",
+            choices=["crop", "resize", "full"],
+            default="full",
+        ),
+        ref_eyeblink: Path = Input(
+            description="path to reference video providing eye blinking",
+            default=None,
+        ),
+        ref_pose: Path = Input(
+            description="path to reference video providing pose",
+            default=None,
+        ),
+        still: bool = Input(
+            description="can crop back to the original image for the full-body animation when preprocess is full",
+            default=True,
+        ),
+    ) -> Path:
+        """Run a single prediction on the model"""
+
+        animate_from_coeff = (
+            self.animate_from_coeff["full"]
+            if preprocess == "full"
+            else self.animate_from_coeff["others"]
+        )
+
+        args = load_default()
+        args.pic_path = str(source_image)
+        args.audio_path = str(driven_audio)
+        device = "cuda"
+        args.still = still
+        args.ref_eyeblink = None if ref_eyeblink is None else str(ref_eyeblink)
+        args.ref_pose = None if ref_pose is None else str(ref_pose)
+
+        # crop image and extract 3dmm from image
+        results_dir = "results"
+        if os.path.exists(results_dir):
+            shutil.rmtree(results_dir)
+        os.makedirs(results_dir)
+        first_frame_dir = os.path.join(results_dir, "first_frame_dir")
+        os.makedirs(first_frame_dir)
+
+        print("3DMM Extraction for source image")
+        first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
+            args.pic_path, first_frame_dir, preprocess, source_image_flag=True
+        )
+        if first_coeff_path is None:
+            print("Can't get the coeffs of the input")
+            return
+
+        if ref_eyeblink is not None:
+            ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
+            ref_eyeblink_frame_dir = os.path.join(results_dir, ref_eyeblink_videoname)
+            os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
+            print("3DMM Extraction for the reference video providing eye blinking")
+            ref_eyeblink_coeff_path, _, _ = self.preprocess_model.generate(
+                ref_eyeblink, ref_eyeblink_frame_dir
+            )
+        else:
+            ref_eyeblink_coeff_path = None
+
+        if ref_pose is not None:
+            if ref_pose == ref_eyeblink:
+                ref_pose_coeff_path = ref_eyeblink_coeff_path
+            else:
+                ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
+                ref_pose_frame_dir = os.path.join(results_dir, ref_pose_videoname)
+                os.makedirs(ref_pose_frame_dir, exist_ok=True)
+                print("3DMM Extraction for the reference video providing pose")
+                ref_pose_coeff_path, _, _ = self.preprocess_model.generate(
+                    ref_pose, ref_pose_frame_dir
+                )
+        else:
+            ref_pose_coeff_path = None
+
+        # audio2coeff
+        batch = get_data(
+            first_coeff_path,
+            args.audio_path,
+            device,
+            ref_eyeblink_coeff_path,
+            still=still,
+        )
+        coeff_path = self.audio_to_coeff.generate(
+            batch, results_dir, args.pose_style, ref_pose_coeff_path
+        )
+        # coeff2video
+        print("coeff2video")
+        data = get_facerender_data(
+            coeff_path,
+            crop_pic_path,
+            first_coeff_path,
+            args.audio_path,
+            args.batch_size,
+            args.input_yaw,
+            args.input_pitch,
+            args.input_roll,
+            expression_scale=args.expression_scale,
+            still_mode=still,
+            preprocess=preprocess,
+        )
+        animate_from_coeff.generate(
+            data, results_dir, args.pic_path, crop_info,
+            enhancer=enhancer, background_enhancer=args.background_enhancer,
+            preprocess=preprocess)
+
+        output = "/tmp/out.mp4"
+        mp4_path = os.path.join(results_dir, [f for f in os.listdir(results_dir) if "enhanced.mp4" in f][0])
+        shutil.copy(mp4_path, output)
+
+        return Path(output)
+
+
+def load_default():
+    return Namespace(
+        pose_style=0,
+        batch_size=2,
+        expression_scale=1.0,
+        input_yaw=None,
+        input_pitch=None,
+        input_roll=None,
+        background_enhancer=None,
+        face3dvis=False,
+        net_recon="resnet50",
+        init_path=None,
+        use_last_fc=False,
+        bfm_folder="./checkpoints/BFM_Fitting/",
+        bfm_model="BFM_model_front.mat",
+        focal=1015.0,
+        center=112.0,
+        camera_d=10.0,
+        z_near=5.0,
+        z_far=15.0,
+    )
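Outside of Cog (which normally drives setup()/predict() itself, e.g. through `cog predict`), the predictor can also be exercised directly. A minimal sketch, assuming a CUDA GPU and a populated checkpoints/ directory; the example paths are only illustrative:

# Hypothetical local test of the Cog predictor defined above.
from predict import Predictor

predictor = Predictor()
predictor.setup()                 # loads all models once
video_path = predictor.predict(
    source_image="examples/source_image/full_body_2.png",
    driven_audio="examples/driven_audio/bus_chinese.wav",
    enhancer="gfpgan",
    preprocess="full",
    ref_eyeblink=None,
    ref_pose=None,
    still=True,
)
print(video_path)                 # the result is copied to /tmp/out.mp4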
quick_demo.ipynb
ADDED
@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "M74Gs_TjYl_B"
+   },
+   "source": [
+    "[](https://colab.research.google.com/github/Winfredy/SadTalker/blob/main/quick_demo.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "view-in-github"
+   },
+   "source": [
+    "### SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation\n",
+    "\n",
+    "[arxiv](https://arxiv.org/abs/2211.12194) | [project](https://sadtalker.github.io) | [Github](https://github.com/Winfredy/SadTalker)\n",
+    "\n",
+    "Wenxuan Zhang, Xiaodong Cun, Xuan Wang, Yong Zhang, Xi Shen, Yu Guo, Ying Shan, Fei Wang.\n",
+    "\n",
+    "Xi'an Jiaotong University, Tencent AI Lab, Ant Group\n",
+    "\n",
+    "CVPR 2023\n",
+    "\n",
+    "TL;DR: A realistic and stylized talking-head video generation method from a single image and audio\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "kA89DV-sKS4i"
+   },
+   "source": [
+    "Installation (around 5 mins)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qJ4CplXsYl_E"
+   },
+   "outputs": [],
+   "source": [
+    "### make sure that CUDA is available in Edit -> Notebook settings -> GPU\n",
+    "!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Mdq6j4E5KQAR"
+   },
+   "outputs": [],
+   "source": [
+    "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.8 2\n",
+    "!update-alternatives --install /usr/local/bin/python3 python3 /usr/bin/python3.9 1\n",
+    "!python --version\n",
+    "!apt-get update\n",
+    "!apt install software-properties-common\n",
+    "!sudo dpkg --remove --force-remove-reinstreq python3-pip python3-setuptools python3-wheel\n",
+    "!apt-get install python3-pip\n",
+    "\n",
+    "print('Git clone project and install requirements...')\n",
+    "!git clone https://github.com/Winfredy/SadTalker &> /dev/null\n",
+    "%cd SadTalker\n",
+    "!export PYTHONPATH=/content/SadTalker:$PYTHONPATH\n",
+    "!python3.8 -m pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113\n",
+    "!apt update\n",
+    "!apt install ffmpeg &> /dev/null\n",
+    "!python3.8 -m pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "DddcKB_nKsnk"
+   },
+   "source": [
+    "Download models (1 min)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "eDw3_UN8K2xa"
+   },
+   "outputs": [],
+   "source": [
+    "print('Download pre-trained models...')\n",
+    "!rm -rf checkpoints\n",
+    "!bash scripts/download_models.sh"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "kK7DYeo7Yl_H"
+   },
+   "outputs": [],
+   "source": [
+    "# borrowed from makeittalk\n",
+    "import ipywidgets as widgets\n",
+    "import glob\n",
+    "import matplotlib.pyplot as plt\n",
+    "print(\"Choose the image name to animate: (saved in folder 'examples/')\")\n",
+    "img_list = glob.glob1('examples/source_image', '*.png')\n",
+    "img_list.sort()\n",
+    "img_list = [item.split('.')[0] for item in img_list]\n",
+    "default_head_name = widgets.Dropdown(options=img_list, value='full3')\n",
+    "def on_change(change):\n",
+    "    if change['type'] == 'change' and change['name'] == 'value':\n",
+    "        plt.imshow(plt.imread('examples/source_image/{}.png'.format(default_head_name.value)))\n",
+    "        plt.axis('off')\n",
+    "        plt.show()\n",
+    "default_head_name.observe(on_change)\n",
+    "display(default_head_name)\n",
+    "plt.imshow(plt.imread('examples/source_image/{}.png'.format(default_head_name.value)))\n",
+    "plt.axis('off')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-khNZcnGK4UK"
+   },
+   "source": [
+    "Animation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "ToBlDusjK5sS"
+   },
+   "outputs": [],
+   "source": [
+    "# selected audio from examples/driven_audio\n",
+    "img = 'examples/source_image/{}.png'.format(default_head_name.value)\n",
+    "print(img)\n",
+    "!python3.8 inference.py --driven_audio ./examples/driven_audio/RD_Radio31_000.wav \\\n",
+    "           --source_image {img} \\\n",
+    "           --result_dir ./results --still --preprocess full --enhancer gfpgan"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "fAjwGmKKYl_I"
+   },
+   "outputs": [],
+   "source": [
+    "# visualization code from makeittalk\n",
+    "from IPython.display import HTML\n",
+    "from base64 import b64encode\n",
+    "import os, sys\n",
+    "\n",
+    "# get the latest result from ./results\n",
+    "results = sorted(os.listdir('./results/'))\n",
+    "\n",
+    "mp4_name = glob.glob('./results/'+results[-1]+'/*.mp4')[0]\n",
+    "\n",
+    "mp4 = open('{}'.format(mp4_name), 'rb').read()\n",
+    "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+    "\n",
+    "print('Display animation: {}'.format(mp4_name), file=sys.stderr)\n",
+    "display(HTML(\"\"\"\n",
+    "  <video width=256 controls>\n",
+    "    <source src=\"%s\" type=\"video/mp4\">\n",
+    "  </video>\n",
+    "  \"\"\" % data_url))\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.9.7"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "db5031b3636a3f037ea48eb287fd3d023feb9033aefc2a9652a92e470fb0851b"
+   }
+  },
+  "accelerator": "GPU",
+  "gpuClass": "standard"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
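The last notebook cell base64-embeds the newest video under ./results for inline display in Colab. Outside a notebook, the same lookup can be done with a short standalone script; a sketch (not part of the committed files):

import glob
import os

# Pick the most recent run directory under ./results and grab its first .mp4.
runs = sorted(os.listdir("./results/"))
mp4_name = glob.glob(os.path.join("./results", runs[-1], "*.mp4"))[0]
print("Latest generated video:", mp4_name)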
requirements3d.txt
ADDED
@@ -0,0 +1,21 @@
+numpy==1.23.4
+face_alignment==1.3.5
+imageio==2.19.3
+imageio-ffmpeg==0.4.7
+librosa==0.9.2
+numba
+resampy==0.3.1
+pydub==0.25.1
+scipy==1.5.3
+kornia==0.6.8
+tqdm
+yacs==0.1.8
+pyyaml
+joblib==1.1.0
+scikit-image==0.19.3
+basicsr==1.4.2
+facexlib==0.2.5
+trimesh==3.9.20
+dlib-bin
+gradio
+gfpgan
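These extra pins (trimesh, dlib-bin, and friends) appear to back the 3D visualization path, i.e. the --face3dvis flag defined in inference.py. A hedged sketch of installing them and producing the 3dface.mp4 side output (the purpose of requirements3d.txt is an assumption, not stated in the commit itself):

import subprocess
import sys

# Assumption: requirements3d.txt provides the dependencies needed by --face3dvis.
subprocess.run([sys.executable, "-m", "pip", "install", "-r", "requirements3d.txt"], check=True)
subprocess.run([
    sys.executable, "inference.py",
    "--driven_audio", "./examples/driven_audio/bus_chinese.wav",
    "--source_image", "./examples/source_image/full_body_2.png",
    "--face3dvis",   # also writes 3dface.mp4 alongside the talking-head result
], check=True)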
webui.bat
ADDED
@@ -0,0 +1,17 @@
+@echo off
+
+IF NOT EXIST venv (
+    python -m venv venv
+) ELSE (
+    echo venv folder already exists, skipping creation...
+)
+call .\venv\Scripts\activate.bat
+
+set PYTHON="venv\Scripts\Python.exe"
+echo venv %PYTHON%
+
+%PYTHON% Launcher.py
+
+echo.
+echo Launch unsuccessful. Exiting.
+pause