% Journal article on any-to-any voice conversion (StarGAN-VC with RawNet3 speaker embeddings).
% Authors are separated with " and " so each one is parsed as an individual name.
% The doi field holds the bare DOI; the bibliography style adds the resolver URL.
% The exporter placeholder ISSN ("null") was dropped; add the real ISSN once it is known.
% NOTE(review): volume/number/pages are missing. The DOI suffix "12.7.303" suggests
% volume 12, number 7, first page 303 -- confirm against the publisher record before adding.
@article{M867DF095,
  author   = {Park, Bogyung and Park, Somin and Hong, Hyunki},
  title    = {Extending {StarGAN-VC} to Unseen Speakers Using {RawNet3} Speaker Representation},
  journal  = {The Transactions of the Korea Information Processing Society},
  year     = {2023},
  doi      = {10.3745/KTSDE.2023.12.7.303},
  keywords = {Voice Conversion, Speaker Attribute, Generalization, StarGAN-VC, RawNet3},
  abstract = {Voice conversion, a technology that allows an individual's speech data to be regenerated with the acoustic properties (tone, cadence,
gender) of another, has countless applications in education, communication, and entertainment. This paper proposes an approach based
on the StarGAN-VC model that generates realistic-sounding speech without requiring parallel utterances. To overcome the constraints
of the existing StarGAN-VC model that utilizes one-hot vectors of original and target speaker information, this paper extracts feature
vectors of target speakers using a pre-trained version of RawNet3. This results in a latent space where voice conversion can be performed
without direct speaker-to-speaker mappings, enabling an any-to-any structure. In addition to the loss terms used in the original
StarGAN-VC model, Wasserstein distance is used as a loss term to ensure that generated voice segments match the acoustic properties
of the target voice. Two Time-Scale Update Rule (TTUR) is also used to facilitate stable training. Experimental results show that the
proposed method outperforms previous methods, including the StarGAN-VC network on which it was based.},
}