Extending eSpeak NG for singing synthesis.

To: debian-accessibility@lists.debian.org
Cc: paulliu@debian.org
Subject: Extending eSpeak NG for singing synthesis.
From: Tobias Platen <tplaten@posteo.de>
Date: Fri, 8 May 2020 18:59:29 +0200
Message-id: <[🔎] 20200508185929.1eef4d5eb228f992288fd634@posteo.de>

I've written a small patch for eSpeak NG that adds a singing synthesis mode, similar to the singing-mode.scm included in Festival and used by the Singing Computer [1].
The patch adds two options for setting the note pitch: one takes a frequency in Hertz (-e), the other a MIDI note number (-u).
When singing mode is selected, eSpeak NG outputs the information needed to transform speech into a singing voice.

[1] https://freebsoft.org/singing-computer

-- 
Tobias Platen <tplaten@posteo.de>

diff --git a/src/espeak-ng.c b/src/espeak-ng.c
index 1634cc1d..d54e0ce8 100644
--- a/src/espeak-ng.c
+++ b/src/espeak-ng.c
@@ -324,6 +324,7 @@ int main(int argc, char **argv)
 		{ "compile-intonations", no_argument, 0, 0x10f },
 		{ "compile-phonemes", optional_argument, 0, 0x110 },
 		{ "load",    no_argument,       0, 0x111 },
+        { "utau-note", required_argument, 0, 0x112 },
 		{ 0, 0, 0, 0 }
 	};
 
@@ -368,7 +369,7 @@ int main(int argc, char **argv)
 	option_punctlist[0] = 0;
 
 	while (true) {
-		c = getopt_long(argc, argv, "a:b:d:f:g:hk:l:mp:qs:v:w:xXz",
+		c = getopt_long(argc, argv, "a:b:d:e:f:g:hk:l:mp:qs:v:u:w:xXz",
 		                long_options, &option_index);
 
 		// Detect the end of the options.
@@ -426,6 +427,12 @@ int main(int argc, char **argv)
 		case 'g':
 			wordgap = atoi(optarg2);
 			break;
+        case 'e':
+            espeak_sg_SetUtauNoteFreq(atoi(optarg2));
+            break;
+        case 'u':
+            espeak_sg_SetUtauNote(atoi(optarg2));
+            break;
 		case 'v':
 			strncpy0(voicename, optarg2, sizeof(voicename));
 			break;
diff --git a/src/include/espeak-ng/speak_lib.h b/src/include/espeak-ng/speak_lib.h
index cc1e35e7..fba10c4f 100644
--- a/src/include/espeak-ng/speak_lib.h
+++ b/src/include/espeak-ng/speak_lib.h
@@ -468,6 +468,9 @@ ESPEAK_API espeak_ERROR espeak_SetParameter(espeak_PARAMETER parameter, int valu
 	   EE_INTERNAL_ERROR.
 */
 
+ESPEAK_API void espeak_sg_SetUtauNote(int value);
+ESPEAK_API void espeak_sg_SetUtauNoteFreq(int value);
+
 #ifdef __cplusplus
 extern "C"
 #endif
diff --git a/src/libespeak-ng/speech.c b/src/libespeak-ng/speech.c
index b415e537..ccf7bedb 100644
--- a/src/libespeak-ng/speech.c
+++ b/src/libespeak-ng/speech.c
@@ -796,6 +796,18 @@ ESPEAK_API int espeak_GetParameter(espeak_PARAMETER parameter, int current)
 	return param_defaults[parameter];
 }
 
+extern int utau_pitch;
+
+ESPEAK_API void espeak_sg_SetUtauNote(int value)
+{
+    utau_pitch = pow(2.0, (value - 69) / 12.0) * 440;
+}
+
+ESPEAK_API void espeak_sg_SetUtauNoteFreq(int value)
+{
+    utau_pitch = value;
+}
+
 ESPEAK_NG_API espeak_ng_STATUS espeak_ng_SetParameter(espeak_PARAMETER parameter, int value, int relative)
 {
 #ifdef USE_ASYNC
diff --git a/src/libespeak-ng/synthesize.c b/src/libespeak-ng/synthesize.c
index b274ae42..eb4583e2 100644
--- a/src/libespeak-ng/synthesize.c
+++ b/src/libespeak-ng/synthesize.c
@@ -223,6 +223,15 @@ static void DoPause(int length, int control)
 	}
 }
 
+
+static void DoOto(char* oto, int type)
+{
+    wcmdq[wcmdq_tail][0] = WCMD_OTO;
+	wcmdq[wcmdq_tail][1] = oto;
+	wcmdq[wcmdq_tail][2] = type;
+	WcmdqInc();
+}
+
 extern int seq_len_adjust; // temporary fix to advance the start point for playing the wav sample
 
 static int DoSample2(int index, int which, int std_length, int control, int length_mod, int amp)
@@ -1131,6 +1140,8 @@ void DoEmbedded(int *embix, int sourceix)
 	} while ((word & 0x80) == 0);
 }
 
+extern int utau_pitch;
+
 int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume)
 {
 	static int ix;
@@ -1187,6 +1198,14 @@ int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume)
 
 	while ((ix < (*n_ph)) && (ix < N_PHONEME_LIST-2)) {
 		p = &phoneme_list[ix];
+        
+        if(utau_pitch)
+		{
+			char ecantorix_debug_buf[30];
+			int ecantorix_debug_flags=0;
+			WritePhMnemonic(ecantorix_debug_buf, p->ph, p, 0, &ecantorix_debug_flags);
+			DoOto(strdup(ecantorix_debug_buf),p->type);
+		}
 
 		if (p->type == phPAUSE)
 			free_min = 10;
@@ -1236,6 +1255,8 @@ int Generate(PHONEME_LIST *phoneme_list, int *n_ph, bool resume)
 			}
 		}
 
+        //emit oto here
+        
 		switch (p->type)
 		{
 		case phPAUSE:
diff --git a/src/libespeak-ng/synthesize.h b/src/libespeak-ng/synthesize.h
index d7760f74..f9d32bd6 100644
--- a/src/libespeak-ng/synthesize.h
+++ b/src/libespeak-ng/synthesize.h
@@ -433,6 +433,7 @@ extern unsigned char pitch_adjust_tab[MAX_PITCH_VALUE+1];
 #define WCMD_MBROLA_DATA 13
 #define WCMD_FMT_AMPLITUDE 14
 #define WCMD_SONIC_SPEED 15
+#define WCMD_OTO 16
 
 #define N_WCMDQ   170
 #define MIN_WCMDQ  25   // need this many free entries before adding new phoneme
diff --git a/src/libespeak-ng/wavegen.c b/src/libespeak-ng/wavegen.c
index 13efb743..d3de98bd 100644
--- a/src/libespeak-ng/wavegen.c
+++ b/src/libespeak-ng/wavegen.c
@@ -53,6 +53,8 @@
 
 voice_t *wvoice = NULL;
 
+int utau_pitch=0;
+
 FILE *f_log = NULL;
 static int option_harmonic1 = 10;
 static int flutter_amp = 64;
@@ -324,6 +326,18 @@ static unsigned char pk_shape2[PEAKSHAPEW+1] = {
 
 static unsigned char *pk_shape;
 
+static void printUtauHeader()
+{
+    if(utau_pitch==0) return;
+    static int done=0;
+    if(done==0) {
+        printf("SG_SAMPLERATE %i\n",samplerate);
+        float period = samplerate*1.0/utau_pitch;
+        printf("SG_PERIOD %f\n",period);
+    }
+    done=1;
+}
+
 void WavegenInit(int rate, int wavemult_fact)
 {
 	int ix;
@@ -530,6 +544,7 @@ int PeaksToHarmspect(wavegen_peaks_t *peaks, int pitch, int *htab, int control)
 	return hmax; // highest harmonic number
 }
 
+
 static void AdvanceParameters()
 {
 	// Called every 64 samples to increment the formant freq, height, and widths
@@ -554,6 +569,12 @@ static void AdvanceParameters()
 	x = ((int)(Flutter_tab[Flutter_ix >> 6])-0x80) * flutter_amp;
 	Flutter_ix += Flutter_inc;
 	wdata.pitch += x;
+    
+    if(utau_pitch)
+    {
+        wdata.pitch = (utau_pitch<<12) + x;
+    }
+    
 	if (wdata.pitch < 102400)
 		wdata.pitch = 102400; // min pitch, 25 Hz  (25 << 12)
 
@@ -671,6 +692,8 @@ static int ApplyBreath(void)
 	return value;
 }
 
+
+
 static int Wavegen()
 {
 	if (wvoice == NULL)
@@ -788,7 +811,7 @@ static int Wavegen()
 					modn_period = modn_period >> 4;
 				}
 
-				if (modn_period != 0) {
+				if (modn_period != 0 && utau_pitch==0) {
 					if (modn_period == 0xf) {
 						// just once */
 						amplitude2 = (amplitude2 * modn_amp)/16;
@@ -826,6 +849,7 @@ static int Wavegen()
 		total += AddSineWaves(waveph, h_switch_sign, maxh, harmspect);  // call an assembler code routine
 #else
 		theta = waveph;
+        
 
 		for (h = 1; h <= h_switch_sign; h++) {
 			total += ((int)sin_tab[theta >> 5] * harmspect[h]);
@@ -863,12 +887,20 @@ static int Wavegen()
 				wdata.mix_wavefile_offset -= (wdata.mix_wavefile_max*3)/4;
 		}
 
-		z1 = z2 + (((total>>8) * amplitude2) >> 13);
+        int z3 = (((total>>8) * amplitude2) >> 13);
+        
+		z1 = z2 + z3;   //mixed = unvoiced + voiced
+       
+        
 
 		echo = (echo_buf[echo_tail++] * echo_amp);
 		z1 += echo >> 8;
 		if (echo_tail >= N_ECHO_BUF)
 			echo_tail = 0;
+            
+        printUtauHeader();
+        if(utau_pitch) printf("SG_V %i %i\n",z3,z2); //we have more bits in text files
+        
 
 		z = (z1 * agc) >> 8;
 
@@ -884,6 +916,7 @@ static int Wavegen()
 		}
 		*out_ptr++ = z;
 		*out_ptr++ = z >> 8;
+        
 
 		echo_buf[echo_head++] = z;
 		if (echo_head >= N_ECHO_BUF)
@@ -915,6 +948,8 @@ static int PlaySilence(int length, bool resume)
 		if (echo_tail >= N_ECHO_BUF)
 			echo_tail = 0;
 
+        printUtauHeader();
+        if(utau_pitch) printf("SG_S %i\n",value);
 		*out_ptr++ = value;
 		*out_ptr++ = value >> 8;
 
@@ -967,6 +1002,8 @@ static int PlayWave(int length, bool resume, unsigned char *data, int scale, int
 		if (echo_tail >= N_ECHO_BUF)
 			echo_tail = 0;
 
+        printUtauHeader();
+        if(utau_pitch) printf("SG_U %i\n",value);
 		out_ptr[0] = value;
 		out_ptr[1] = value >> 8;
 		out_ptr += 2;
@@ -1281,6 +1318,15 @@ static int WavegenFill2()
 
 		switch (q[0] & 0xff)
 		{
+        case WCMD_OTO:
+        {
+            char* data = (char*)q[1];
+            printUtauHeader();
+            char* ototypes[]= {"PAUSE","STRESS","VOWEL","LIQUID","STOP","VSTOP","FRICATIVE","VFRICATIVE","NASAL","VIRTUAL","DELETED","INVALID"};
+			printf("SG_OTO %s %s\n",data,ototypes[q[2]]);
+            free(data);
+        }
+        break;
 		case WCMD_PITCH:
 			SetPitch(length, (unsigned char *)q[2], q[3] >> 16, q[3] & 0xffff);
 			break;

Reply to:

Follow-Ups:
- Re: Extending eSpeak NG for singing synthesis.
  - From: Samuel Thibault <sthibault@debian.org>

Prev by Date: Re: Off topic, Linux shop
Next by Date: Re: Extending eSpeak NG for singing synthesis.
Previous by thread: Re: Off topic, Linux shop
Next by thread: Re: Extending eSpeak NG for singing synthesis.
Index(es):
- Date
- Thread