[tds_menu_login inline="yes" guest_tdicon="td-icon-profile" logout_tdicon="td-icon-log-out" tdc_css="eyJwaG9uZSI6eyJtYXJnaW4tcmlnaHQiOiIyMCIsIm1hcmdpbi1ib3R0b20iOiIwIiwibWFyZ2luLWxlZnQiOiI2IiwiZGlzcGxheSI6IiJ9LCJwaG9uZV9tYXhfd2lkdGgiOjc2N30=" toggle_hide="eyJwaG9uZSI6InllcyJ9" ia_space="eyJwaG9uZSI6IjAifQ==" icon_size="eyJhbGwiOjI0LCJwaG9uZSI6IjIwIn0=" avatar_size="eyJwaG9uZSI6IjIwIn0=" show_menu="yes" menu_offset_top="eyJwaG9uZSI6IjE4In0=" menu_offset_horiz="eyJhbGwiOjgsInBob25lIjoiLTMifQ==" menu_width="eyJwaG9uZSI6IjE4MCJ9" menu_horiz_align="eyJhbGwiOiJjb250ZW50LWhvcml6LWxlZnQiLCJwaG9uZSI6ImNvbnRlbnQtaG9yaXotcmlnaHQifQ==" menu_uh_padd="eyJwaG9uZSI6IjEwcHggMTVweCA4cHgifQ==" menu_gh_padd="eyJwaG9uZSI6IjEwcHggMTVweCA4cHgifQ==" menu_ul_padd="eyJwaG9uZSI6IjhweCAxNXB4In0=" menu_ul_space="eyJwaG9uZSI6IjYifQ==" menu_ulo_padd="eyJwaG9uZSI6IjhweCAxNXB4IDEwcHgifQ==" menu_gc_padd="eyJwaG9uZSI6IjhweCAxNXB4IDEwcHgifQ==" menu_bg="var(--news-hub-black)" menu_shadow_shadow_size="eyJwaG9uZSI6IjAifQ==" menu_arrow_color="rgba(0,0,0,0)" menu_uh_color="var(--news-hub-light-grey)" menu_uh_border_color="var(--news-hub-dark-grey)" menu_ul_link_color="var(--news-hub-white)" menu_ul_link_color_h="var(--news-hub-accent-hover)" menu_ul_sep_color="var(--news-hub-dark-grey)" menu_uf_txt_color="var(--news-hub-white)" menu_uf_txt_color_h="var(--news-hub-accent-hover)" menu_uf_border_color="var(--news-hub-dark-grey)" f_uh_font_size="eyJwaG9uZSI6IjEyIn0=" f_uh_font_line_height="eyJwaG9uZSI6IjEuMyJ9" f_uh_font_family="eyJwaG9uZSI6IjMyNSJ9" f_links_font_size="eyJwaG9uZSI6IjEyIn0=" f_links_font_line_height="eyJwaG9uZSI6IjEuMyJ9" f_links_font_family="eyJwaG9uZSI6IjMyNSJ9" f_uf_font_size="eyJwaG9uZSI6IjEyIn0=" f_uf_font_line_height="eyJwaG9uZSI6IjEuMyJ9" f_uf_font_family="eyJwaG9uZSI6IjMyNSJ9" f_gh_font_family="eyJwaG9uZSI6IjMyNSJ9" f_gh_font_size="eyJwaG9uZSI6IjEyIn0=" f_gh_font_line_height="eyJwaG9uZSI6IjEuMyJ9" f_btn1_font_family="eyJwaG9uZSI6IjMyNSJ9" f_btn1_font_weight="eyJwaG9uZSI6IjcwMCJ9" f_btn1_font_transform="eyJwaG9uZSI6InVwcGVyY2FzZSJ9" f_btn2_font_weight="eyJwaG9uZSI6IjcwMCJ9" f_btn2_font_transform="eyJwaG9uZSI6InVwcGVyY2FzZSJ9" f_btn2_font_family="eyJwaG9uZSI6IjMyNSJ9"]
10.3 C
New York
[tds_menu_login guest_tdicon="td-icon-profile" logout_tdicon="td-icon-log-out" tdc_css="eyJhbGwiOnsibWFyZ2luLWJvdHRvbSI6IjAiLCJkaXNwbGF5IjoiIn19" toggle_txt_color="var(--news-hub-white)" menu_offset_top="eyJhbGwiOiIxOSIsImxhbmRzY2FwZSI6IjE3IiwicG9ydHJhaXQiOiIxNSJ9" menu_offset_horiz="eyJhbGwiOi02LCJsYW5kc2NhcGUiOiItMyIsInBvcnRyYWl0IjoiLTIifQ==" menu_horiz_align="content-horiz-right" menu_bg="var(--news-hub-black)" menu_uh_color="var(--news-hub-light-grey)" menu_uh_border_color="var(--news-hub-dark-grey)" menu_ul_link_color="#ffffff" menu_ul_link_color_h="var(--news-hub-accent-hover)" menu_ul_sep_color="var(--news-hub-dark-grey)" menu_uf_txt_color="var(--news-hub-white)" menu_uf_txt_color_h="var(--news-hub-accent-hover)" menu_uf_border_color="var(--news-hub-dark-grey)" f_uh_font_family="325" f_uh_font_line_height="1.3" f_links_font_family="325" f_links_font_line_height="1.3" f_uf_font_line_height="1.3" f_uf_font_family="325" menu_uh_padd="eyJhbGwiOiIyMHB4IDI1cHggMThweCIsImxhbmRzY2FwZSI6IjE1cHggMjBweCAxM3B4IiwicG9ydHJhaXQiOiIxMHB4IDE1cHggOHB4In0=" menu_ul_padd="eyJhbGwiOiIxOHB4IDI1cHgiLCJsYW5kc2NhcGUiOiIxNnB4IDIwcHgiLCJwb3J0cmFpdCI6IjhweCAxNXB4In0=" menu_ul_space="eyJhbGwiOiIxMCIsImxhbmRzY2FwZSI6IjgiLCJwb3J0cmFpdCI6IjYifQ==" menu_ulo_padd="eyJhbGwiOiIxOHB4IDI1cHggMjBweCIsImxhbmRzY2FwZSI6IjEzcHggMjBweCAxNXB4IiwicG9ydHJhaXQiOiI4cHggMTVweCAxMHB4In0=" menu_shadow_shadow_size="0" menu_arrow_color="rgba(255,255,255,0)" menu_width="eyJhbGwiOiIyMjAiLCJwb3J0cmFpdCI6IjE4MCJ9" show_version="" menu_gh_padd="eyJhbGwiOiIyMHB4IDI1cHggMThweCIsImxhbmRzY2FwZSI6IjE1cHggMjBweCAxM3B4IiwicG9ydHJhaXQiOiIxMHB4IDE1cHggOHB4In0=" menu_gc_padd="eyJhbGwiOiIxOHB4IDI1cHggMjBweCIsImxhbmRzY2FwZSI6IjEzcHggMjBweCAxNXB4IiwicG9ydHJhaXQiOiI4cHggMTVweCAxMHB4In0=" menu_gh_color="var(--news-hub-light-grey)" menu_gh_border_color="var(--news-hub-dark-grey)" f_gh_font_family="325" menu_gc_btn1_bg_color="var(--news-hub-accent)" menu_gc_btn1_bg_color_h="var(--news-hub-accent-hover)" menu_gc_btn2_color="var(--news-hub-accent)" menu_gc_btn2_color_h="var(--news-hub-accent-hover)" f_btn1_font_family="325" f_btn1_font_transform="uppercase" f_btn2_font_family="325" f_btn2_font_transform="uppercase" f_btn1_font_weight="700" f_btn2_font_weight="700" show_menu="yes" f_uf_font_size="eyJsYW5kc2NhcGUiOiIxMiIsInBvcnRyYWl0IjoiMTIifQ==" icon_color="var(--news-hub-white)" icon_size="eyJhbGwiOjIyLCJsYW5kc2NhcGUiOiIyMCIsInBvcnRyYWl0IjoiMTgifQ==" avatar_size="eyJhbGwiOiIyMiIsImxhbmRzY2FwZSI6IjIwIiwicG9ydHJhaXQiOiIxOCJ9" ia_space="eyJhbGwiOiIxMCIsImxhbmRzY2FwZSI6IjgiLCJwb3J0cmFpdCI6IjYifQ==" f_toggle_font_family="325" f_toggle_font_size="eyJhbGwiOiIxNCIsImxhbmRzY2FwZSI6IjEzIiwicG9ydHJhaXQiOiIxMiJ9" logout_size="eyJhbGwiOjE0LCJsYW5kc2NhcGUiOiIxMyJ9" f_uh_font_size="eyJsYW5kc2NhcGUiOiIxMyIsInBvcnRyYWl0IjoiMTIifQ==" f_links_font_size="eyJsYW5kc2NhcGUiOiIxMyIsInBvcnRyYWl0IjoiMTIifQ==" f_gh_font_size="eyJsYW5kc2NhcGUiOiIxMyIsInBvcnRyYWl0IjoiMTIifQ=="]

Microsoft’s VASA-1 can deepfake a person with one photo and one audio track

Published:

A sample image from Microsoft for

Enlarge / A sample image from Microsoft for "VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time." (credit: Microsoft)

On Tuesday, Microsoft Research Asia unveiled VASA-1, an AI model that can create a synchronized animated video of a person talking or singing from a single photo and an existing audio track. In the future, it could power virtual avatars that render locally and don't require video feeds—or allow anyone with similar tools to take a photo of a person found online and make them appear to say whatever they want.

"It paves the way for real-time engagements with lifelike avatars that emulate human conversational behaviors," reads the abstract of the accompanying research paper titled, "VASA-1: Lifelike Audio-Driven Talking Faces Generated in Real Time." It's the work of Sicheng Xu, Guojun Chen, Yu-Xiao Guo, Jiaolong Yang, Chong Li, Zhenyu Zang, Yizhong Zhang, Xin Tong, and Baining Guo.

The VASA framework (short for "Visual Affective Skills Animator") uses machine learning to analyze a static image along with a speech audio clip. It is then able to generate a realistic video with precise facial expressions, head movements, and lip-syncing to the audio. It does not clone or simulate voices (like other Microsoft research) but relies on an existing audio input that could be specially recorded or spoken for a particular purpose.

Read 11 remaining paragraphs | Comments

Ars Technica - All contentContinue reading/original-link]

Related articles

spot_img

Recent articles

spot_img