three.ws/src/runtime/lipsync.js at main · nirholas/three.ws · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
// Lipsync driver — text-to-viseme heuristic. No audio analysis required.
// Returns { stop() } that resets all mouth morphs and cancels the animation loop.

const ARKIT_VISEMES = [
	'viseme_aa', 'viseme_CH', 'viseme_DD', 'viseme_E', 'viseme_FF',
	'viseme_I', 'viseme_kk', 'viseme_nn', 'viseme_O', 'viseme_PP',
	'viseme_RR', 'viseme_sil', 'viseme_SS', 'viseme_TH', 'viseme_U',
];

const JAW_FALLBACKS = ['jawOpen', 'mouthOpen'];

const CHAR_TO_VISEME = {
	a: 'viseme_aa', e: 'viseme_E', i: 'viseme_I', o: 'viseme_O', u: 'viseme_U',
	b: 'viseme_PP', m: 'viseme_PP', p: 'viseme_PP',
	f: 'viseme_FF', v: 'viseme_FF',
	d: 'viseme_DD', t: 'viseme_DD', n: 'viseme_DD', l: 'viseme_DD',
	k: 'viseme_kk', g: 'viseme_kk',
	s: 'viseme_SS', z: 'viseme_SS',
	r: 'viseme_RR',
};

const DIGRAPH_TO_VISEME = {
	th: 'viseme_TH',
	ch: 'viseme_CH',
	sh: 'viseme_CH',
};

const MS_PER_PHONEME = 80;
const WORD_GAP_MS = 120;
const PUNCT_GAP_MS = 200;
// Lerp factor per rAF step — reaches ~95% of target in ~5 frames at 60 fps
const LERP_FACTOR = 0.35;

function _buildMorphMap(root) {
	const arkit = new Map();
	const jaw = new Map();

	root.traverse?.((node) => {
		if (!node.isMesh || !node.morphTargetDictionary || !node.morphTargetInfluences) return;
		const dict = node.morphTargetDictionary;

		for (const name of ARKIT_VISEMES) {
			const idx = dict[name];
			if (idx === undefined) continue;
			if (!arkit.has(name)) arkit.set(name, []);
			arkit.get(name).push({ mesh: node, index: idx });
		}

		for (const name of JAW_FALLBACKS) {
			const idx = dict[name];
			if (idx === undefined) continue;
			if (!jaw.has(name)) jaw.set(name, []);
			jaw.get(name).push({ mesh: node, index: idx });
		}
	});

	if (arkit.size) return { mode: 'arkit', map: arkit };
	if (jaw.size) return { mode: 'jaw', map: jaw };
	return null;
}

function _tokenize(text) {
	const sequence = [];
	const lower = text.toLowerCase();
	let t = 0;
	let i = 0;

	while (i < lower.length) {
		const pair = i + 1 < lower.length ? lower[i] + lower[i + 1] : '';
		if (DIGRAPH_TO_VISEME[pair]) {
			sequence.push({ viseme: DIGRAPH_TO_VISEME[pair], startMs: t, endMs: t + MS_PER_PHONEME * 2 });
			t += MS_PER_PHONEME * 2;
			i += 2;
			continue;
		}

		const ch = lower[i];
		if (CHAR_TO_VISEME[ch]) {
			sequence.push({ viseme: CHAR_TO_VISEME[ch], startMs: t, endMs: t + MS_PER_PHONEME });
			t += MS_PER_PHONEME;
		} else if (ch === ' ' || ch === '\t' || ch === '\n') {
			t += WORD_GAP_MS;
		} else if (/[.,!?;:]/.test(ch)) {
			t += PUNCT_GAP_MS;
		}
		i++;
	}

	return sequence;
}

function _setMorph(targets, weight) {
	for (const { mesh, index } of targets) {
		mesh.morphTargetInfluences[index] = weight;
	}
}

function _resetMap(map) {
	for (const targets of map.values()) {
		_setMorph(targets, 0);
	}
}

// The viseme active at `elapsedMs` in a sorted [{viseme,startMs,endMs}] sequence,
// or null in the gaps. Linear scan — sequences are short (one utterance).
export function activeVisemeAt(sequence, elapsedMs) {
	for (const entry of sequence) {
		if (elapsedMs >= entry.startMs && elapsedMs < entry.endMs) return entry.viseme;
	}
	return null;
}

/**
 * Build a morph driver bound to an avatar's mouth/jaw blendshapes. Returns null
 * when the avatar has none. `step(targetViseme)` lerps one frame toward the given
 * viseme (or rest when null); `reset()` zeroes every morph. Shared by the text
 * heuristic ({@link startLipsync}) and the neural timestamp lane (neural-tts.js).
 *
 * @param {import('three').Object3D} root
 * @returns {{ mode:'arkit'|'jaw', step(target:string|null):void, reset():void }|null}
 */
export function createVisemeDriver(root) {
	const result = _buildMorphMap(root);
	if (!result) return null;
	const { mode, map } = result;
	const weights = new Map();
	for (const name of map.keys()) weights.set(name, 0);
	return {
		mode,
		step(targetViseme) {
			// 'arkit' rigs hit the exact viseme morph; 'jaw'-only rigs open the jaw
			// for any active viseme (a single morph can't shape phonemes).
			for (const [name, targets] of map.entries()) {
				const goal = mode === 'arkit' ? (name === targetViseme ? 1 : 0) : targetViseme ? 0.7 : 0;
				const cur = weights.get(name);
				const next = cur + (goal - cur) * LERP_FACTOR;
				weights.set(name, next);
				_setMorph(targets, next);
			}
		},
		reset() {
			for (const name of weights.keys()) weights.set(name, 0);
			_resetMap(map);
		},
	};
}

/**
 * Drive mouth morphs from a timed viseme sequence against a caller-supplied clock.
 * The clock decouples timing from wall-time: the text heuristic ticks on
 * `performance.now`, while the neural lane ticks on `audio.currentTime` so visemes
 * stay locked to playback even if the audio starts late or stalls.
 *
 * @param {import('three').Object3D} root
 * @param {Array<{viseme:string,startMs:number,endMs:number}>} sequence
 * @param {() => number} elapsedMs — current elapsed time, in ms
 * @returns {{ stop(): void }}
 */
export function playVisemeSequence(root, sequence, elapsedMs = () => 0) {
	const driver = createVisemeDriver(root);
	if (!driver || !sequence?.length) return { stop: () => {} };

	let rafId;
	let stopped = false;
	const tick = () => {
		if (stopped) return;
		driver.step(activeVisemeAt(sequence, elapsedMs()));
		rafId = requestAnimationFrame(tick);
	};
	rafId = requestAnimationFrame(tick);

	return {
		stop() {
			stopped = true;
			if (rafId !== undefined) cancelAnimationFrame(rafId);
			driver.reset();
		},
	};
}

/**
 * Drive jaw/mouth morph targets in sync with speech text via a phoneme heuristic.
 * Supports the ARKit viseme_* morph targets that three.ws avatars ship with (and
 * the same names exported by Mixamo / VRM), plus jawOpen / mouthOpen fallbacks.
 * Used when the TTS lane gives no real timing (browser SpeechSynthesis); the
 * neural lane uses {@link playVisemeSequence} with actual phoneme timestamps.
 *
 * @param {string} text — the spoken text
 * @param {import('three').Object3D} root — avatar root (or scene) to traverse for morph targets
 * @returns {{ stop(): void }}
 */
export function startLipsync(text, root) {
	if (!text || !root) return { stop: () => {} };
	const sequence = _tokenize(text);
	const t0 = performance.now();
	return playVisemeSequence(root, sequence, () => performance.now() - t0);
}