% Journal article: StarGAN-VC voice conversion extended to unseen speakers
% using RawNet3 speaker embeddings (Park, Park and Hong, 2023).
% Authors are stored as "Last, First" joined by " and " so each name parses
% as a separate person; the DOI is stored bare, without a resolver prefix.
% NOTE(review): the export had no usable ISSN (placeholder value only), and
% volume/number/pages are absent. The DOI suffix suggests vol. 12, no. 7,
% p. 303 -- confirm against the publisher record before adding them.
@article{M11981802,
  author   = {Park, Bogyung and Park, Somin and Hong, Hyunki},
  title    = {Extending {StarGAN-VC} to Unseen Speakers Using {RawNet3} Speaker Representation},
  journal  = {The Transactions of the Korea Information Processing Society},
  year     = {2023},
  doi      = {10.3745/KTSDE.2023.12.7.303},
  keywords = {Voice Conversion, Speaker Attribute, Generalization, StarGAN-VC, RawNet3},
  abstract = {Voice conversion, a technology that allows an individual's speech data to be regenerated with the acoustic properties (tone, cadence, gender) of another, has countless applications in education, communication, and entertainment. This paper proposes an approach based on the StarGAN-VC model that generates realistic-sounding speech without requiring parallel utterances. To overcome the constraints of the existing StarGAN-VC model that utilizes one-hot vectors of original and target speaker information, this paper extracts feature vectors of target speakers using a pre-trained version of RawNet3. This results in a latent space where voice conversion can be performed without direct speaker-to-speaker mappings, enabling an any-to-any structure. In addition to the loss terms used in the original StarGAN-VC model, Wasserstein distance is used as a loss term to ensure that generated voice segments match the acoustic properties of the target voice. Two Time-Scale Update Rule (TTUR) is also used to facilitate stable training. Experimental results show that the proposed method outperforms previous methods, including the StarGAN-VC network on which it was based.},
}